Commit b46bae4

CLN: More read_csv state (#59210)

* Clean up index methods
* Remove unused try_parse_dates
* Clean up usecols and date processing
* Clean up buffer clearing
* Remove some single-use helpers
* Typing

1 parent b608ddb commit b46bae4

File tree: 3 files changed (+84, -108 lines)


pandas/io/parsers/base_parser.py

+53, -66 lines changed
@@ -274,46 +274,34 @@ def _make_index(
         self, data, alldata, columns, indexnamerow: list[Scalar] | None = None
     ) -> tuple[Index | None, Sequence[Hashable] | MultiIndex]:
         index: Index | None
-        if not is_index_col(self.index_col) or not self.index_col:
-            index = None
+        if isinstance(self.index_col, list) and len(self.index_col):
+            to_remove = []
+            indexes = []
+            for idx in self.index_col:
+                if isinstance(idx, str):
+                    raise ValueError(f"Index {idx} invalid")
+                to_remove.append(idx)
+                indexes.append(alldata[idx])
+            # remove index items from content and columns, don't pop in
+            # loop
+            for i in sorted(to_remove, reverse=True):
+                alldata.pop(i)
+                if not self._implicit_index:
+                    columns.pop(i)
+            index = self._agg_index(indexes)
+
+            # add names for the index
+            if indexnamerow:
+                coffset = len(indexnamerow) - len(columns)
+                index = index.set_names(indexnamerow[:coffset])
         else:
-            simple_index = self._get_simple_index(alldata, columns)
-            index = self._agg_index(simple_index)
-
-        # add names for the index
-        if indexnamerow:
-            coffset = len(indexnamerow) - len(columns)
-            assert index is not None
-            index = index.set_names(indexnamerow[:coffset])
+            index = None

         # maybe create a mi on the columns
         columns = self._maybe_make_multi_index_columns(columns, self.col_names)

         return index, columns

-    @final
-    def _get_simple_index(self, data, columns):
-        def ix(col):
-            if not isinstance(col, str):
-                return col
-            raise ValueError(f"Index {col} invalid")
-
-        to_remove = []
-        index = []
-        for idx in self.index_col:
-            i = ix(idx)
-            to_remove.append(i)
-            index.append(data[i])
-
-        # remove index items from content and columns, don't pop in
-        # loop
-        for i in sorted(to_remove, reverse=True):
-            data.pop(i)
-            if not self._implicit_index:
-                columns.pop(i)
-
-        return index
-
     @final
     def _clean_mapping(self, mapping):
         """converts col numbers to names"""
@@ -333,12 +321,13 @@ def _clean_mapping(self, mapping):
         return clean

     @final
-    def _agg_index(self, index, try_parse_dates: bool = True) -> Index:
+    def _agg_index(self, index) -> Index:
         arrays = []
         converters = self._clean_mapping(self.converters)
+        clean_dtypes = self._clean_mapping(self.dtype)

         for i, arr in enumerate(index):
-            if try_parse_dates and self._should_parse_dates(i):
+            if self._should_parse_dates(i):
                 arr = date_converter(
                     arr,
                     col=self.index_names[i] if self.index_names is not None else None,
@@ -364,8 +353,6 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index:
             else:
                 col_na_values, col_na_fvalues = set(), set()

-            clean_dtypes = self._clean_mapping(self.dtype)
-
             cast_type = None
             index_converter = False
             if self.index_names is not None:
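
The removed try_parse_dates flag was effectively always True, so whether an index column goes through date_converter now depends only on _should_parse_dates. At the user level this is the familiar parse_dates/index_col combination; a small illustration using the public API (not part of the diff):

    from io import StringIO

    import pandas as pd

    csv = "date,value\n2024-01-01,1\n2024-01-02,2\n"
    df = pd.read_csv(StringIO(csv), index_col="date", parse_dates=["date"])
    print(df.index.dtype)  # datetime64[ns]
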
@@ -632,35 +619,6 @@ def _check_data_length(
                 stacklevel=find_stack_level(),
             )

-    @overload
-    def _evaluate_usecols(
-        self,
-        usecols: Callable[[Hashable], object],
-        names: Iterable[Hashable],
-    ) -> set[int]: ...
-
-    @overload
-    def _evaluate_usecols(
-        self, usecols: SequenceT, names: Iterable[Hashable]
-    ) -> SequenceT: ...
-
-    @final
-    def _evaluate_usecols(
-        self,
-        usecols: Callable[[Hashable], object] | SequenceT,
-        names: Iterable[Hashable],
-    ) -> SequenceT | set[int]:
-        """
-        Check whether or not the 'usecols' parameter
-        is a callable. If so, enumerates the 'names'
-        parameter and returns a set of indices for
-        each entry in 'names' that evaluates to True.
-        If not a callable, returns 'usecols'.
-        """
-        if callable(usecols):
-            return {i for i, name in enumerate(names) if usecols(name)}
-        return usecols
-
     @final
     def _validate_usecols_names(self, usecols: SequenceT, names: Sequence) -> SequenceT:
         """
@@ -988,3 +946,32 @@ def _validate_usecols_arg(usecols):

         return usecols, usecols_dtype
     return usecols, None
+
+
+@overload
+def evaluate_callable_usecols(
+    usecols: Callable[[Hashable], object],
+    names: Iterable[Hashable],
+) -> set[int]: ...
+
+
+@overload
+def evaluate_callable_usecols(
+    usecols: SequenceT, names: Iterable[Hashable]
+) -> SequenceT: ...
+
+
+def evaluate_callable_usecols(
+    usecols: Callable[[Hashable], object] | SequenceT,
+    names: Iterable[Hashable],
+) -> SequenceT | set[int]:
+    """
+    Check whether or not the 'usecols' parameter
+    is a callable. If so, enumerates the 'names'
+    parameter and returns a set of indices for
+    each entry in 'names' that evaluates to True.
+    If not a callable, returns 'usecols'.
+    """
+    if callable(usecols):
+        return {i for i, name in enumerate(names) if usecols(name)}
+    return usecols
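
The behavior of evaluate_callable_usecols mirrors the public usecols argument: a callable is invoked once per column name and the columns for which it returns True are kept, while list-like values pass through unchanged. A short illustration against the public read_csv API (example data made up):

    from io import StringIO

    import pandas as pd

    csv = "a,b,keep_me\n1,2,3\n"
    df = pd.read_csv(StringIO(csv), usecols=lambda name: name.startswith("keep"))
    print(list(df.columns))  # ['keep_me']
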

pandas/io/parsers/c_parser_wrapper.py

+22, -27 lines changed
@@ -31,6 +31,7 @@
     ParserBase,
     ParserError,
     date_converter,
+    evaluate_callable_usecols,
     is_index_col,
     validate_parse_dates_presence,
 )
@@ -133,7 +134,7 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None:
         self.orig_names = self.names[:]  # type: ignore[has-type]

         if self.usecols:
-            usecols = self._evaluate_usecols(self.usecols, self.orig_names)
+            usecols = evaluate_callable_usecols(self.usecols, self.orig_names)

             # GH 14671
             # assert for mypy, orig_names is List or None, None would error in issubset
@@ -256,8 +257,7 @@ def read(
                     columns, self.col_names
                 )

-                if self.usecols is not None:
-                    columns = self._filter_usecols(columns)
+                columns = _filter_usecols(self.usecols, columns)

                 col_dict = {k: v for k, v in col_dict.items() if k in columns}

@@ -290,13 +290,21 @@ def read(
                 else:
                     values = data.pop(self.index_col[i])

-                values = self._maybe_parse_dates(values, i, try_parse_dates=True)
+                if self._should_parse_dates(i):
+                    values = date_converter(
+                        values,
+                        col=self.index_names[i]
+                        if self.index_names is not None
+                        else None,
+                        dayfirst=self.dayfirst,
+                        cache_dates=self.cache_dates,
+                        date_format=self.date_format,
+                    )
                 arrays.append(values)

             index = ensure_index_from_sequences(arrays)

-            if self.usecols is not None:
-                names = self._filter_usecols(names)
+            names = _filter_usecols(self.usecols, names)

             names = dedup_names(names, is_potential_multi_index(names, self.index_col))

@@ -320,8 +328,7 @@ def read(
             names = list(self.orig_names)
             names = dedup_names(names, is_potential_multi_index(names, self.index_col))

-            if self.usecols is not None:
-                names = self._filter_usecols(names)
+            names = _filter_usecols(self.usecols, names)

             # columns as list
             alldata = [x[1] for x in data_tups]
@@ -335,25 +342,13 @@ def read(

         return index, column_names, date_data

-    def _filter_usecols(self, names: SequenceT) -> SequenceT | list[Hashable]:
-        # hackish
-        usecols = self._evaluate_usecols(self.usecols, names)
-        if usecols is not None and len(names) != len(usecols):
-            return [
-                name for i, name in enumerate(names) if i in usecols or name in usecols
-            ]
-        return names
-
-    def _maybe_parse_dates(self, values, index: int, try_parse_dates: bool = True):
-        if try_parse_dates and self._should_parse_dates(index):
-            values = date_converter(
-                values,
-                col=self.index_names[index] if self.index_names is not None else None,
-                dayfirst=self.dayfirst,
-                cache_dates=self.cache_dates,
-                date_format=self.date_format,
-            )
-        return values
+
+def _filter_usecols(usecols, names: SequenceT) -> SequenceT | list[Hashable]:
+    # hackish
+    usecols = evaluate_callable_usecols(usecols, names)
+    if usecols is not None and len(names) != len(usecols):
+        return [name for i, name in enumerate(names) if i in usecols or name in usecols]
+    return names


 def _concatenate_chunks(
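
For reference, the filtering rule used by the new module-level _filter_usecols can be stated on its own: a column survives when either its position or its label appears in the evaluated usecols. A standalone sketch under that assumption (hypothetical function name, simplified types):

    from typing import Hashable, Sequence


    def filter_names(usecols, names: Sequence[Hashable]) -> list[Hashable]:
        # Accept a callable, a set of positions, or a collection of labels,
        # and keep matching columns in their original order.
        if callable(usecols):
            usecols = {i for i, name in enumerate(names) if usecols(name)}
        if len(usecols) == len(names):
            return list(names)
        return [name for i, name in enumerate(names) if i in usecols or name in usecols]


    print(filter_names({0, "c"}, ["a", "b", "c"]))            # ['a', 'c']
    print(filter_names(lambda n: n != "b", ["a", "b", "c"]))  # ['a', 'c']
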

pandas/io/parsers/python_parser.py

+9, -15 lines changed
@@ -59,6 +59,7 @@
 )
 from pandas.io.parsers.base_parser import (
     ParserBase,
+    evaluate_callable_usecols,
     get_na_values,
     parser_defaults,
     validate_parse_dates_presence,
@@ -127,9 +128,8 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None:
         self.quoting = kwds["quoting"]
         self.skip_blank_lines = kwds["skip_blank_lines"]

-        self.has_index_names = False
-        if "has_index_names" in kwds:
-            self.has_index_names = kwds["has_index_names"]
+        # Passed from read_excel
+        self.has_index_names = kwds.get("has_index_names", False)

         self.thousands = kwds["thousands"]
         self.decimal = kwds["decimal"]
@@ -299,9 +299,10 @@ def read(
             return index, conv_columns, col_dict

         # handle new style for names in index
-        count_empty_content_vals = count_empty_vals(content[0])
         indexnamerow = None
-        if self.has_index_names and count_empty_content_vals == len(columns):
+        if self.has_index_names and sum(
+            int(v == "" or v is None) for v in content[0]
+        ) == len(columns):
             indexnamerow = content[0]
             content = content[1:]
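
The single-use count_empty_vals helper is inlined as a generator expression; the condition simply asks whether every cell in the first content row is empty, which is how an index-name row is recognised. Illustrative only:

    row = ["", None, "x"]
    columns = ["a", "b", "c"]
    is_index_name_row = sum(int(v == "" or v is None) for v in row) == len(columns)
    print(is_index_name_row)  # False: "x" is a real value
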

@@ -605,7 +606,7 @@ def _infer_columns(
                     # serve as the 'line' for parsing
                     if have_mi_columns and hr > 0:
                         if clear_buffer:
-                            self._clear_buffer()
+                            self.buf.clear()
                         columns.append([None] * len(columns[-1]))
                         return columns, num_original_columns, unnamed_cols

@@ -687,7 +688,7 @@ def _infer_columns(
                     num_original_columns = len(this_columns)

             if clear_buffer:
-                self._clear_buffer()
+                self.buf.clear()

             first_line: list[Scalar] | None
             if names is not None:
@@ -774,7 +775,7 @@ def _handle_usecols(
         col_indices: set[int] | list[int]
         if self.usecols is not None:
             if callable(self.usecols):
-                col_indices = self._evaluate_usecols(self.usecols, usecols_key)
+                col_indices = evaluate_callable_usecols(self.usecols, usecols_key)
             elif any(isinstance(u, str) for u in self.usecols):
                 if len(columns) > 1:
                     raise ValueError(
@@ -1094,9 +1095,6 @@ def _check_decimal(self, lines: list[list[Scalar]]) -> list[list[Scalar]]:
             lines=lines, search=self.decimal, replace="."
         )

-    def _clear_buffer(self) -> None:
-        self.buf = []
-
     def _get_index_name(
         self,
     ) -> tuple[Sequence[Hashable] | None, list[Hashable], list[Hashable]]:
@@ -1526,10 +1524,6 @@ def _remove_empty_lines(self, lines: list[list[T]]) -> list[list[T]]:
         ]


-def count_empty_vals(vals) -> int:
-    return sum(1 for v in vals if v == "" or v is None)
-
-
 def _validate_skipfooter_arg(skipfooter: int) -> int:
     """
     Validate the 'skipfooter' parameter.
