Skip to content

Commit 19e5d35

Browse files
committed
REF: Unify _set_noconvert_dtype_columns for parsers
1 parent 309cf3a commit 19e5d35

File tree

4 files changed

+87
-95
lines changed

4 files changed

+87
-95
lines changed

pandas/io/parsers/base_parser.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,8 @@ def __init__(self, kwds):
172172

173173
self._first_chunk = True
174174

175+
self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"])
176+
175177
self.handles: Optional[IOHandles] = None
176178

177179
def _open_handles(self, src: FilePathOrBuffer, kwds: Dict[str, Any]) -> None:
@@ -546,6 +548,65 @@ def _convert_to_ndarrays(
546548
print(f"Filled {na_count} NA values in column {c!s}")
547549
return result
548550

551+
def _set_noconvert_dtype_columns(self, col_indices, names):
552+
"""
553+
Set the columns that should not undergo dtype conversions.
554+
555+
Currently, any column that is involved with date parsing will not
556+
undergo such conversions.
557+
"""
558+
noconvert_columns = set()
559+
if self.usecols_dtype == "integer":
560+
# A set of integers will be converted to a list in
561+
# the correct order every single time.
562+
usecols = list(self.usecols)
563+
usecols.sort()
564+
elif callable(self.usecols) or self.usecols_dtype not in ("empty", None):
565+
# The names attribute should have the correct columns
566+
# in the proper order for indexing with parse_dates.
567+
usecols = col_indices
568+
else:
569+
# Usecols is empty.
570+
571+
# pandas\io\parsers.py:2030: error: Incompatible types in
572+
# assignment (expression has type "None", variable has type
573+
# "List[Any]") [assignment]
574+
usecols = None # type: ignore[assignment]
575+
576+
def _set(x):
577+
if usecols is not None and is_integer(x):
578+
x = usecols[x]
579+
580+
if not is_integer(x):
581+
x = col_indices[names.index(x)]
582+
583+
noconvert_columns.add(x)
584+
585+
if isinstance(self.parse_dates, list):
586+
for val in self.parse_dates:
587+
if isinstance(val, list):
588+
for k in val:
589+
_set(k)
590+
else:
591+
_set(val)
592+
593+
elif isinstance(self.parse_dates, dict):
594+
for val in self.parse_dates.values():
595+
if isinstance(val, list):
596+
for k in val:
597+
_set(k)
598+
else:
599+
_set(val)
600+
601+
elif self.parse_dates:
602+
if isinstance(self.index_col, list):
603+
for k in self.index_col:
604+
_set(k)
605+
elif self.index_col is not None:
606+
_set(self.index_col)
607+
608+
return noconvert_columns
609+
549610
def _infer_types(self, values, na_values, try_num_bool=True):
550611
"""
551612
Infer types of values, possibly casting

pandas/io/parsers/c_parser_wrapper.py

Lines changed: 5 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
import pandas._libs.parsers as parsers
22
from pandas._typing import FilePathOrBuffer
33

4-
from pandas.core.dtypes.common import is_integer
5-
64
from pandas.core.indexes.api import ensure_index_from_sequences
75

86
from pandas.io.parsers.base_parser import ParserBase, is_index_col
@@ -19,7 +17,6 @@ def __init__(self, src: FilePathOrBuffer, **kwds):
1917
kwds["allow_leading_cols"] = self.index_col is not False
2018

2119
# GH20529, validate usecol arg before TextReader
22-
self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"])
2320
kwds["usecols"] = self.usecols
2421

2522
# open handles
@@ -159,58 +156,11 @@ def _set_noconvert_columns(self):
159156
Currently, any column that is involved with date parsing will not
160157
undergo such conversions.
161158
"""
162-
names = self.orig_names
163-
if self.usecols_dtype == "integer":
164-
# A set of integers will be converted to a list in
165-
# the correct order every single time.
166-
usecols = list(self.usecols)
167-
usecols.sort()
168-
elif callable(self.usecols) or self.usecols_dtype not in ("empty", None):
169-
# The names attribute should have the correct columns
170-
# in the proper order for indexing with parse_dates.
171-
usecols = self.names[:]
172-
else:
173-
# Usecols is empty.
174-
175-
# pandas\io\parsers.py:2030: error: Incompatible types in
176-
# assignment (expression has type "None", variable has type
177-
# "List[Any]") [assignment]
178-
usecols = None # type: ignore[assignment]
179-
180-
def _set(x):
181-
if usecols is not None and is_integer(x):
182-
x = usecols[x]
183-
184-
if not is_integer(x):
185-
# assert for mypy, names is List or None, None would error when calling
186-
# .index()
187-
assert names is not None
188-
x = names.index(x)
189-
190-
self._reader.set_noconvert(x)
191-
192-
if isinstance(self.parse_dates, list):
193-
for val in self.parse_dates:
194-
if isinstance(val, list):
195-
for k in val:
196-
_set(k)
197-
else:
198-
_set(val)
199-
200-
elif isinstance(self.parse_dates, dict):
201-
for val in self.parse_dates.values():
202-
if isinstance(val, list):
203-
for k in val:
204-
_set(k)
205-
else:
206-
_set(val)
207-
208-
elif self.parse_dates:
209-
if isinstance(self.index_col, list):
210-
for k in self.index_col:
211-
_set(k)
212-
elif self.index_col is not None:
213-
_set(self.index_col)
159+
assert self.orig_names is not None
160+
col_indices = [self.orig_names.index(x) for x in self.names]
161+
noconvert_columns = self._set_noconvert_dtype_columns(col_indices, self.names)
162+
for col in noconvert_columns:
163+
self._reader.set_noconvert(col)
214164

215165
def set_error_bad_lines(self, status):
216166
self._reader.set_error_bad_lines(int(status))

pandas/io/parsers/python_parser.py

Lines changed: 3 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,6 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds):
5353
self.skipinitialspace = kwds["skipinitialspace"]
5454
self.lineterminator = kwds["lineterminator"]
5555
self.quoting = kwds["quoting"]
56-
self.usecols, _ = self._validate_usecols_arg(kwds["usecols"])
5756
self.skip_blank_lines = kwds["skip_blank_lines"]
5857

5958
self.warn_bad_lines = kwds["warn_bad_lines"]
@@ -137,7 +136,9 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds):
137136

138137
self._validate_parse_dates_presence(self.columns)
139138
if self.parse_dates:
140-
self._no_thousands_columns = self._set_no_thousands_columns()
139+
self._no_thousands_columns = self._set_noconvert_dtype_columns(
140+
self._col_indices, self.columns
141+
)
141142
else:
142143
self._no_thousands_columns = None
143144

@@ -155,44 +156,6 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds):
155156
)
156157
self.num = re.compile(regex)
157158

158-
def _set_no_thousands_columns(self):
159-
# Create a set of column ids that are not to be stripped of thousands
160-
# operators.
161-
noconvert_columns = set()
162-
163-
def _set(x):
164-
if is_integer(x):
165-
noconvert_columns.add(x)
166-
else:
167-
assert self._col_indices is not None
168-
col_indices = self._col_indices
169-
noconvert_columns.add(col_indices[self.columns.index(x)])
170-
171-
if isinstance(self.parse_dates, list):
172-
for val in self.parse_dates:
173-
if isinstance(val, list):
174-
for k in val:
175-
_set(k)
176-
else:
177-
_set(val)
178-
179-
elif isinstance(self.parse_dates, dict):
180-
for val in self.parse_dates.values():
181-
if isinstance(val, list):
182-
for k in val:
183-
_set(k)
184-
else:
185-
_set(val)
186-
187-
elif self.parse_dates:
188-
if isinstance(self.index_col, list):
189-
for k in self.index_col:
190-
_set(k)
191-
elif self.index_col is not None:
192-
_set(self.index_col)
193-
194-
return noconvert_columns
195-
196159
def _make_reader(self, f):
197160
sep = self.delimiter
198161

pandas/tests/io/parser/test_parse_dates.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1603,3 +1603,21 @@ def test_date_parser_and_names(all_parsers):
16031603
result = parser.read_csv(data, parse_dates=["B"], names=["B"])
16041604
expected = DataFrame({"B": ["y", "2"]}, index=["x", "1"])
16051605
tm.assert_frame_equal(result, expected)
1606+
1607+
1608+
def test_date_parser_usecols_thousands(all_parsers):
    """
    Read a CSV with ``usecols``, positional ``parse_dates`` and ``thousands="-"``.

    The parsed frame must contain column ``B`` as integers and column ``C``
    as timestamps.
    """
    # GH# -- TODO: the issue number is missing in the original comment
    csv_text = """A,B,C
1,3,20-09-01-01
2,4,20-09-01-01
"""

    # Both rows carry the same date string, hence the same expected stamp.
    stamp = Timestamp("20-09-2001 01:00:00")
    expected = DataFrame({"B": [3, 4], "C": [stamp, stamp]})

    result = all_parsers.read_csv(
        StringIO(csv_text),
        parse_dates=[1],
        usecols=[1, 2],
        thousands="-",
    )

    tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)