Skip to content

Commit bc3adf2

Browse files
authored
REF: Unify _set_noconvert_dtype_columns for parsers (#39365)
1 parent b8890eb commit bc3adf2

File tree

5 files changed

+100
-98
lines changed

5 files changed

+100
-98
lines changed

doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,7 @@ I/O
325325
- Bug in :func:`to_hdf` raising ``KeyError`` when trying to apply for subclasses of ``DataFrame`` or ``Series`` (:issue:`33748`)
326326
- Bug in :meth:`~HDFStore.put` raising a wrong ``TypeError`` when saving a DataFrame with non-string dtype (:issue:`34274`)
327327
- Bug in :func:`json_normalize` resulting in the first element of a generator object not being included in the returned ``DataFrame`` (:issue:`35923`)
328+
- Bug in :func:`read_csv` applying thousands separator to date columns when column should be parsed for dates and ``usecols`` is specified for ``engine="python"`` (:issue:`39365`)
328329
- Bug in :func:`read_excel` forward filling :class:`MultiIndex` names with multiple header and index columns specified (:issue:`34673`)
329330
- :func:`read_excel` now respects :func:`set_option` (:issue:`34252`)
330331
- Bug in :func:`read_csv` not switching ``true_values`` and ``false_values`` for nullable ``boolean`` dtype (:issue:`34655`)

pandas/io/parsers/base_parser.py

+70
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,8 @@ def __init__(self, kwds):
172172

173173
self._first_chunk = True
174174

175+
self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"])
176+
175177
self.handles: Optional[IOHandles] = None
176178

177179
def _open_handles(self, src: FilePathOrBuffer, kwds: Dict[str, Any]) -> None:
@@ -546,6 +548,74 @@ def _convert_to_ndarrays(
546548
print(f"Filled {na_count} NA values in column {c!s}")
547549
return result
548550

551+
def _set_noconvert_dtype_columns(
552+
self, col_indices: List[int], names: List[Union[int, str]]
553+
) -> Set[int]:
554+
"""
555+
Set the columns that should not undergo dtype conversions.
556+
557+
Currently, any column that is involved with date parsing will not
558+
undergo such conversions. If usecols is specified, the positions of the columns
559+
not to cast is relative to the usecols not to all columns.
560+
561+
Parameters
562+
----------
563+
col_indices: The indices specifying order and positions of the columns
564+
names: The column names which order is corresponding with the order
565+
of col_indices
566+
567+
Returns
568+
-------
569+
A set of integers containing the positions of the columns not to convert.
570+
"""
571+
usecols: Optional[Union[List[int], List[str]]]
572+
noconvert_columns = set()
573+
if self.usecols_dtype == "integer":
574+
# A set of integers will be converted to a list in
575+
# the correct order every single time.
576+
usecols = sorted(self.usecols)
577+
elif callable(self.usecols) or self.usecols_dtype not in ("empty", None):
578+
# The names attribute should have the correct columns
579+
# in the proper order for indexing with parse_dates.
580+
usecols = col_indices
581+
else:
582+
# Usecols is empty.
583+
usecols = None
584+
585+
def _set(x) -> int:
586+
if usecols is not None and is_integer(x):
587+
x = usecols[x]
588+
589+
if not is_integer(x):
590+
x = col_indices[names.index(x)]
591+
592+
return x
593+
594+
if isinstance(self.parse_dates, list):
595+
for val in self.parse_dates:
596+
if isinstance(val, list):
597+
for k in val:
598+
noconvert_columns.add(_set(k))
599+
else:
600+
noconvert_columns.add(_set(val))
601+
602+
elif isinstance(self.parse_dates, dict):
603+
for val in self.parse_dates.values():
604+
if isinstance(val, list):
605+
for k in val:
606+
noconvert_columns.add(_set(k))
607+
else:
608+
noconvert_columns.add(_set(val))
609+
610+
elif self.parse_dates:
611+
if isinstance(self.index_col, list):
612+
for k in self.index_col:
613+
noconvert_columns.add(_set(k))
614+
elif self.index_col is not None:
615+
noconvert_columns.add(_set(self.index_col))
616+
617+
return noconvert_columns
618+
549619
def _infer_types(self, values, na_values, try_num_bool=True):
550620
"""
551621
Infer types of values, possibly casting

pandas/io/parsers/c_parser_wrapper.py

+5-55
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
import pandas._libs.parsers as parsers
22
from pandas._typing import FilePathOrBuffer
33

4-
from pandas.core.dtypes.common import is_integer
5-
64
from pandas.core.indexes.api import ensure_index_from_sequences
75

86
from pandas.io.parsers.base_parser import ParserBase, is_index_col
@@ -19,7 +17,6 @@ def __init__(self, src: FilePathOrBuffer, **kwds):
1917
kwds["allow_leading_cols"] = self.index_col is not False
2018

2119
# GH20529, validate usecol arg before TextReader
22-
self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"])
2320
kwds["usecols"] = self.usecols
2421

2522
# open handles
@@ -159,58 +156,11 @@ def _set_noconvert_columns(self):
159156
Currently, any column that is involved with date parsing will not
160157
undergo such conversions.
161158
"""
162-
names = self.orig_names
163-
if self.usecols_dtype == "integer":
164-
# A set of integers will be converted to a list in
165-
# the correct order every single time.
166-
usecols = list(self.usecols)
167-
usecols.sort()
168-
elif callable(self.usecols) or self.usecols_dtype not in ("empty", None):
169-
# The names attribute should have the correct columns
170-
# in the proper order for indexing with parse_dates.
171-
usecols = self.names[:]
172-
else:
173-
# Usecols is empty.
174-
175-
# pandas\io\parsers.py:2030: error: Incompatible types in
176-
# assignment (expression has type "None", variable has type
177-
# "List[Any]") [assignment]
178-
usecols = None # type: ignore[assignment]
179-
180-
def _set(x):
181-
if usecols is not None and is_integer(x):
182-
x = usecols[x]
183-
184-
if not is_integer(x):
185-
# assert for mypy, names is List or None, None would error when calling
186-
# .index()
187-
assert names is not None
188-
x = names.index(x)
189-
190-
self._reader.set_noconvert(x)
191-
192-
if isinstance(self.parse_dates, list):
193-
for val in self.parse_dates:
194-
if isinstance(val, list):
195-
for k in val:
196-
_set(k)
197-
else:
198-
_set(val)
199-
200-
elif isinstance(self.parse_dates, dict):
201-
for val in self.parse_dates.values():
202-
if isinstance(val, list):
203-
for k in val:
204-
_set(k)
205-
else:
206-
_set(val)
207-
208-
elif self.parse_dates:
209-
if isinstance(self.index_col, list):
210-
for k in self.index_col:
211-
_set(k)
212-
elif self.index_col is not None:
213-
_set(self.index_col)
159+
assert self.orig_names is not None
160+
col_indices = [self.orig_names.index(x) for x in self.names]
161+
noconvert_columns = self._set_noconvert_dtype_columns(col_indices, self.names)
162+
for col in noconvert_columns:
163+
self._reader.set_noconvert(col)
214164

215165
def set_error_bad_lines(self, status):
216166
self._reader.set_error_bad_lines(int(status))

pandas/io/parsers/python_parser.py

+6-43
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from io import StringIO
44
import re
55
import sys
6-
from typing import Iterator, List, Optional, cast
6+
from typing import Iterator, List, Optional, Set, cast
77

88
import numpy as np
99

@@ -53,7 +53,6 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds):
5353
self.skipinitialspace = kwds["skipinitialspace"]
5454
self.lineterminator = kwds["lineterminator"]
5555
self.quoting = kwds["quoting"]
56-
self.usecols, _ = self._validate_usecols_arg(kwds["usecols"])
5756
self.skip_blank_lines = kwds["skip_blank_lines"]
5857

5958
self.warn_bad_lines = kwds["warn_bad_lines"]
@@ -136,10 +135,12 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds):
136135
self._col_indices = list(range(len(self.columns)))
137136

138137
self._validate_parse_dates_presence(self.columns)
138+
no_thousands_columns: Optional[Set[int]] = None
139139
if self.parse_dates:
140-
self._no_thousands_columns = self._set_no_thousands_columns()
141-
else:
142-
self._no_thousands_columns = None
140+
no_thousands_columns = self._set_noconvert_dtype_columns(
141+
self._col_indices, self.columns
142+
)
143+
self._no_thousands_columns = no_thousands_columns
143144

144145
if len(self.decimal) != 1:
145146
raise ValueError("Only length-1 decimal markers supported")
@@ -155,44 +156,6 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds):
155156
)
156157
self.num = re.compile(regex)
157158

158-
def _set_no_thousands_columns(self):
159-
# Create a set of column ids that are not to be stripped of thousands
160-
# operators.
161-
noconvert_columns = set()
162-
163-
def _set(x):
164-
if is_integer(x):
165-
noconvert_columns.add(x)
166-
else:
167-
assert self._col_indices is not None
168-
col_indices = self._col_indices
169-
noconvert_columns.add(col_indices[self.columns.index(x)])
170-
171-
if isinstance(self.parse_dates, list):
172-
for val in self.parse_dates:
173-
if isinstance(val, list):
174-
for k in val:
175-
_set(k)
176-
else:
177-
_set(val)
178-
179-
elif isinstance(self.parse_dates, dict):
180-
for val in self.parse_dates.values():
181-
if isinstance(val, list):
182-
for k in val:
183-
_set(k)
184-
else:
185-
_set(val)
186-
187-
elif self.parse_dates:
188-
if isinstance(self.index_col, list):
189-
for k in self.index_col:
190-
_set(k)
191-
elif self.index_col is not None:
192-
_set(self.index_col)
193-
194-
return noconvert_columns
195-
196159
def _make_reader(self, f):
197160
sep = self.delimiter
198161

pandas/tests/io/parser/test_parse_dates.py

+18
Original file line numberDiff line numberDiff line change
@@ -1603,3 +1603,21 @@ def test_date_parser_and_names(all_parsers):
16031603
result = parser.read_csv(data, parse_dates=["B"], names=["B"])
16041604
expected = DataFrame({"B": ["y", "2"]}, index=["x", "1"])
16051605
tm.assert_frame_equal(result, expected)
1606+
1607+
1608+
def test_date_parser_usecols_thousands(all_parsers):
    """
    Check that the thousands separator is not applied to a parsed date column.

    ``parse_dates=[1]`` is combined with ``usecols=[1, 2]``, and the
    expected frame has column "C" as the timestamp column.
    """
    # GH#39365
    csv_text = """A,B,C
1,3,20-09-01-01
2,4,20-09-01-01
"""
    stamp = Timestamp("20-09-2001 01:00:00")
    expected = DataFrame({"B": [3, 4], "C": [stamp] * 2})

    read_options = {"parse_dates": [1], "usecols": [1, 2], "thousands": "-"}
    result = all_parsers.read_csv(StringIO(csv_text), **read_options)

    tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)