
Commit 4558e7d

TYP: Type parts of python parser (pandas-dev#45015)
* TYP: Type python parser
* Fix bug
* Fix assignment issue
* Address conflicts
* Remove unnecessary changes
* Adjust
1 parent de6b11d commit 4558e7d

File tree: 3 files changed (+54 -33 lines changed)

pandas/io/parsers/base_parser.py (+12 -11)
@@ -303,9 +303,7 @@ def _extract_multi_indexer_columns(
 
         # clean the index_names
         index_names = header.pop(-1)
-        index_names, _, _ = self._clean_index_names(
-            index_names, self.index_col, self.unnamed_cols
-        )
+        index_names, _, _ = self._clean_index_names(index_names, self.index_col)
 
         # extract the columns
         field_count = len(header[0])
@@ -381,21 +379,24 @@ def _maybe_make_multi_index_columns(
         return columns
 
     @final
-    def _make_index(self, data, alldata, columns, indexnamerow=False):
+    def _make_index(
+        self, data, alldata, columns, indexnamerow=False
+    ) -> tuple[Index | None, Sequence[Hashable] | MultiIndex]:
+        index: Index | None
         if not is_index_col(self.index_col) or not self.index_col:
             index = None
 
         elif not self._has_complex_date_col:
-            index = self._get_simple_index(alldata, columns)
-            index = self._agg_index(index)
+            simple_index = self._get_simple_index(alldata, columns)
+            index = self._agg_index(simple_index)
         elif self._has_complex_date_col:
             if not self._name_processed:
                 (self.index_names, _, self.index_col) = self._clean_index_names(
-                    list(columns), self.index_col, self.unnamed_cols
+                    list(columns), self.index_col
                 )
                 self._name_processed = True
-            index = self._get_complex_date_index(data, columns)
-            index = self._agg_index(index, try_parse_dates=False)
+            date_index = self._get_complex_date_index(data, columns)
+            index = self._agg_index(date_index, try_parse_dates=False)
 
         # add names for the index
         if indexnamerow:
@@ -966,7 +967,7 @@ def _validate_usecols_arg(self, usecols):
             return usecols, usecols_dtype
         return usecols, None
 
-    def _clean_index_names(self, columns, index_col, unnamed_cols):
+    def _clean_index_names(self, columns, index_col):
         if not is_index_col(index_col):
             return None, columns, index_col
 
@@ -998,7 +999,7 @@ def _clean_index_names(self, columns, index_col, unnamed_cols):
 
         # Only clean index names that were placeholders.
         for i, name in enumerate(index_names):
-            if isinstance(name, str) and name in unnamed_cols:
+            if isinstance(name, str) and name in self.unnamed_cols:
                 index_names[i] = None
 
         return index_names, columns, index_col
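
Note on the pattern above (not part of the commit): `_make_index` now declares the eventual type of `index` once and gives intermediate results their own names (`simple_index`, `date_index`) instead of re-binding one variable to several types. A minimal, self-contained sketch of that pattern, with hypothetical names:

from __future__ import annotations


def load_raw() -> str:
    # Stand-in for a step that returns a differently typed intermediate value.
    return "42"


def load(flag: bool) -> int | None:
    result: int | None  # annotated once, assigned in every branch
    if not flag:
        result = None
    else:
        raw_value = load_raw()  # the str intermediate lives under its own name
        result = int(raw_value)  # the annotated name only ever holds int | None
    return result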

pandas/io/parsers/c_parser_wrapper.py (+11 -8)
@@ -172,7 +172,6 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds):
                 self.names,  # type: ignore[has-type]
                 # error: Cannot determine type of 'index_col'
                 self.index_col,  # type: ignore[has-type]
-                self.unnamed_cols,
             )
 
             if self.index_names is None:
@@ -220,6 +219,8 @@ def read(
         Sequence[Hashable] | MultiIndex,
         Mapping[Hashable, ArrayLike],
     ]:
+        index: Index | MultiIndex | None
+        column_names: Sequence[Hashable] | MultiIndex
         try:
             if self.low_memory:
                 chunks = self._reader.read_low_memory(nrows)
@@ -284,7 +285,12 @@ def read(
             data_tups = sorted(data.items())
             data = {k: v for k, (i, v) in zip(names, data_tups)}
 
-            names, date_data = self._do_date_conversions(names, data)
+            column_names, date_data = self._do_date_conversions(names, data)
+
+            # maybe create a mi on the columns
+            column_names = self._maybe_make_multi_index_columns(
+                column_names, self.col_names
+            )
 
         else:
             # rename dict keys
@@ -308,12 +314,9 @@ def read(
             data = {k: v for k, (i, v) in zip(names, data_tups)}
 
             names, date_data = self._do_date_conversions(names, data)
-            index, names = self._make_index(date_data, alldata, names)
-
-            # maybe create a mi on the columns
-            conv_names = self._maybe_make_multi_index_columns(names, self.col_names)
+            index, column_names = self._make_index(date_data, alldata, names)
 
-        return index, conv_names, date_data
+        return index, column_names, date_data
 
     def _filter_usecols(self, names: Sequence[Hashable]) -> Sequence[Hashable]:
         # hackish
@@ -330,7 +333,7 @@ def _get_index_names(self):
 
         if self._reader.leading_cols == 0 and self.index_col is not None:
             (idx_names, names, self.index_col) = self._clean_index_names(
-                names, self.index_col, self.unnamed_cols
+                names, self.index_col
             )
 
         return names, idx_names
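
Note on the `read` changes above (not part of the commit): mypy infers a variable's type from its first binding, so re-assigning a value of a different type to `names` would be rejected; the result therefore moves to a separately annotated `column_names`. A small sketch of the same idea, with hypothetical names:

from __future__ import annotations


def as_labels(raw: list[str]) -> tuple[str, ...]:
    return tuple(raw)


def demo(raw: list[str]) -> tuple[str, ...]:
    # Re-binding raw here would fail type checking:
    # raw = as_labels(raw)  # error: incompatible types in assignment
    column_names: tuple[str, ...] = as_labels(raw)  # fresh, annotated name
    return column_names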

pandas/io/parsers/python_parser.py (+31 -14)
@@ -13,6 +13,7 @@
     DefaultDict,
     Hashable,
     Iterator,
+    List,
     Literal,
     Mapping,
     Sequence,
@@ -37,6 +38,11 @@
 from pandas.core.dtypes.common import is_integer
 from pandas.core.dtypes.inference import is_dict_like
 
+from pandas import (
+    Index,
+    MultiIndex,
+)
+
 from pandas.io.parsers.base_parser import (
     ParserBase,
     parser_defaults,
@@ -167,7 +173,7 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds):
         )
         self.num = re.compile(regex)
 
-    def _make_reader(self, f) -> None:
+    def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> None:
         sep = self.delimiter
 
         if sep is None or len(sep) == 1:
@@ -198,10 +204,11 @@ class MyDialect(csv.Dialect):
                     self.pos += 1
                     line = f.readline()
                     lines = self._check_comments([[line]])[0]
+                lines_str = cast(List[str], lines)
 
                 # since `line` was a string, lines will be a list containing
                 # only a single string
-                line = lines[0]
+                line = lines_str[0]
 
                 self.pos += 1
                 self.line_pos += 1
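
For context (hypothetical example, not pandas code): `typing.cast` is a runtime no-op that only narrows the static type, which is what `lines_str = cast(List[str], lines)` relies on when the surrounding code knows the nested lists hold strings.

from typing import List, cast


def first_cell(rows: list[list[object]]) -> str:
    row = rows[0]
    # The caller guarantees strings here; cast() changes nothing at runtime,
    # it only tells the type checker to treat row as List[str].
    row_str = cast(List[str], row)
    return row_str[0]


print(first_cell([["a", "b"]]))  # prints "a"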
@@ -233,7 +240,11 @@ def _read():
         # TextIOWrapper, mmap, None]")
         self.data = reader  # type: ignore[assignment]
 
-    def read(self, rows: int | None = None):
+    def read(
+        self, rows: int | None = None
+    ) -> tuple[
+        Index | None, Sequence[Hashable] | MultiIndex, Mapping[Hashable, ArrayLike]
+    ]:
         try:
             content = self._get_lines(rows)
         except StopIteration:
@@ -273,9 +284,11 @@ def read(self, rows: int | None = None):
         conv_data = self._convert_data(data)
         columns, conv_data = self._do_date_conversions(columns, conv_data)
 
-        index, columns = self._make_index(conv_data, alldata, columns, indexnamerow)
+        index, result_columns = self._make_index(
+            conv_data, alldata, columns, indexnamerow
+        )
 
-        return index, columns, conv_data
+        return index, result_columns, conv_data
 
     def _exclude_implicit_index(
         self,
@@ -586,7 +599,7 @@ def _handle_usecols(
             self._col_indices = sorted(col_indices)
         return columns
 
-    def _buffered_line(self):
+    def _buffered_line(self) -> list[Scalar]:
         """
         Return a line from buffer, filling buffer if required.
         """
@@ -878,7 +891,9 @@ def _clear_buffer(self) -> None:
 
     _implicit_index = False
 
-    def _get_index_name(self, columns: list[Hashable]):
+    def _get_index_name(
+        self, columns: list[Hashable]
+    ) -> tuple[list[Hashable] | None, list[Hashable], list[Hashable]]:
         """
         Try several cases to get lines:
 
@@ -943,8 +958,8 @@ def _get_index_name(self, columns: list[Hashable]):
 
         else:
             # Case 2
-            (index_name, columns_, self.index_col) = self._clean_index_names(
-                columns, self.index_col, self.unnamed_cols
+            (index_name, _, self.index_col) = self._clean_index_names(
+                columns, self.index_col
             )
 
         return index_name, orig_names, columns
@@ -1036,7 +1051,7 @@ def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]:
         ]
         return zipped_content
 
-    def _get_lines(self, rows: int | None = None):
+    def _get_lines(self, rows: int | None = None) -> list[list[Scalar]]:
         lines = self.buf
         new_rows = None
 
@@ -1133,7 +1148,7 @@ class FixedWidthReader(abc.Iterator):
 
     def __init__(
         self,
-        f: IO[str],
+        f: IO[str] | ReadCsvBuffer[str],
         colspecs: list[tuple[int, int]] | Literal["infer"],
         delimiter: str | None,
         comment: str | None,
@@ -1230,14 +1245,16 @@ def detect_colspecs(
         return edge_pairs
 
     def __next__(self) -> list[str]:
+        # Argument 1 to "next" has incompatible type "Union[IO[str],
+        # ReadCsvBuffer[str]]"; expected "SupportsNext[str]"
         if self.buffer is not None:
             try:
                 line = next(self.buffer)
             except StopIteration:
                 self.buffer = None
-                line = next(self.f)
+                line = next(self.f)  # type: ignore[arg-type]
         else:
-            line = next(self.f)
+            line = next(self.f)  # type: ignore[arg-type]
         # Note: 'colspecs' is a sequence of half-open intervals.
         return [line[fromm:to].strip(self.delimiter) for (fromm, to) in self.colspecs]
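For context (hypothetical types, not the pandas classes): the `# type: ignore[arg-type]` above is scoped to a single error code and is paired with a comment quoting the mypy message, so unrelated errors on the same line still surface. A minimal sketch of that style:

from __future__ import annotations

from typing import Iterator, Sequence, Union


def first_line(src: Union[Iterator[str], Sequence[str]]) -> str:
    # error: Argument 1 to "next" has incompatible type
    # "Union[Iterator[str], Sequence[str]]"; expected "SupportsNext[str]"
    # Callers always pass an iterator, so only this error code is silenced.
    return next(src)  # type: ignore[arg-type]


print(first_line(iter(["a", "b"])))  # prints "a"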

@@ -1254,7 +1271,7 @@ def __init__(self, f: ReadCsvBuffer[str], **kwds) -> None:
         self.infer_nrows = kwds.pop("infer_nrows")
         PythonParser.__init__(self, f, **kwds)
 
-    def _make_reader(self, f: IO[str]) -> None:
+    def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> None:
         self.data = FixedWidthReader(
             f,
             self.colspecs,
