Skip to content

Commit fb47c75

Browse files
authored
BUG: read_csv not converting to float for python engine with decimal sep, usecols and parse_dates (#38334)
1 parent 5cc24c2 commit fb47c75

File tree

5 files changed

+31
-14
lines changed

5 files changed

+31
-14
lines changed

doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,7 @@ I/O
264264
- Bug in :func:`read_csv` interpreting an ``NA`` value as a comment when the ``NA`` value contains the comment string; fixed for ``engine="python"`` (:issue:`34002`)
265265
- Bug in :func:`read_csv` raising ``IndexError`` with multiple header columns and ``index_col`` specified when file has no data rows (:issue:`38292`)
266266
- Bug in :func:`read_csv` not accepting ``usecols`` with different length than ``names`` for ``engine="python"`` (:issue:`16469`)
267+
- Bug in :func:`read_csv` returning object dtype instead of converting to float when ``decimal=","`` is used together with ``usecols`` and ``parse_dates`` for ``engine="python"`` (:issue:`35873`)
267268
- Bug in :func:`read_csv` raising ``TypeError`` when ``names`` and ``parse_dates`` are specified for ``engine="c"`` (:issue:`33699`)
268269
- Bug in :func:`read_clipboard`, :func:`DataFrame.to_clipboard` not working in WSL (:issue:`38527`)
269270
- Allow custom error values for parse_dates argument of :func:`read_sql`, :func:`read_sql_query` and :func:`read_sql_table` (:issue:`35185`)

pandas/io/parsers.py

+13-13
Original file line numberDiff line numberDiff line change
@@ -2293,7 +2293,7 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds):
22932293

22942294
# Get columns in two steps: infer from data, then
22952295
# infer column indices from self.usecols if it is specified.
2296-
self._col_indices = None
2296+
self._col_indices: Optional[List[int]] = None
22972297
try:
22982298
(
22992299
self.columns,
@@ -2335,6 +2335,9 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds):
23352335
if self.index_names is None:
23362336
self.index_names = index_names
23372337

2338+
if self._col_indices is None:
2339+
self._col_indices = list(range(len(self.columns)))
2340+
23382341
self._validate_parse_dates_presence(self.columns)
23392342
if self.parse_dates:
23402343
self._no_thousands_columns = self._set_no_thousands_columns()
@@ -2364,7 +2367,9 @@ def _set(x):
23642367
if is_integer(x):
23652368
noconvert_columns.add(x)
23662369
else:
2367-
noconvert_columns.add(self.columns.index(x))
2370+
assert self._col_indices is not None
2371+
col_indices = self._col_indices
2372+
noconvert_columns.add(col_indices[self.columns.index(x)])
23682373

23692374
if isinstance(self.parse_dates, list):
23702375
for val in self.parse_dates:
@@ -2706,7 +2711,6 @@ def _infer_columns(self):
27062711
# overwritten.
27072712
self._handle_usecols(columns, names)
27082713
else:
2709-
self._col_indices = None
27102714
num_original_columns = len(names)
27112715
columns = [names]
27122716
else:
@@ -2788,7 +2792,7 @@ def _handle_usecols(self, columns, usecols_key):
27882792
[n for i, n in enumerate(column) if i in col_indices]
27892793
for column in columns
27902794
]
2791-
self._col_indices = col_indices
2795+
self._col_indices = sorted(col_indices)
27922796
return columns
27932797

27942798
def _buffered_line(self):
@@ -3186,25 +3190,21 @@ def _rows_to_cols(self, content):
31863190
zipped_content = list(lib.to_object_array(content, min_width=col_len).T)
31873191

31883192
if self.usecols:
3193+
assert self._col_indices is not None
3194+
col_indices = self._col_indices
3195+
31893196
if self._implicit_index:
31903197
zipped_content = [
31913198
a
31923199
for i, a in enumerate(zipped_content)
31933200
if (
31943201
i < len(self.index_col)
3195-
# pandas\io\parsers.py:3159: error: Unsupported right
3196-
# operand type for in ("Optional[Any]") [operator]
3197-
or i - len(self.index_col) # type: ignore[operator]
3198-
in self._col_indices
3202+
or i - len(self.index_col) in col_indices
31993203
)
32003204
]
32013205
else:
32023206
zipped_content = [
3203-
# pandas\io\parsers.py:3164: error: Unsupported right
3204-
# operand type for in ("Optional[Any]") [operator]
3205-
a
3206-
for i, a in enumerate(zipped_content)
3207-
if i in self._col_indices # type: ignore[operator]
3207+
a for i, a in enumerate(zipped_content) if i in col_indices
32083208
]
32093209
return zipped_content
32103210

pandas/tests/io/parser/dtypes/__init__.py

Whitespace-only changes.

pandas/tests/io/parser/dtypes/test_dtypes_basic.py

+17-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from pandas.errors import ParserWarning
1111

1212
import pandas as pd
13-
from pandas import DataFrame
13+
from pandas import DataFrame, Timestamp
1414
import pandas._testing as tm
1515

1616

@@ -165,3 +165,19 @@ def test_boolean_dtype(all_parsers):
165165
)
166166

167167
tm.assert_frame_equal(result, expected)
168+
169+
170+
def test_delimiter_with_usecols_and_parse_dates(all_parsers):
171+
# GH#35873
172+
result = all_parsers.read_csv(
173+
StringIO('"dump","-9,1","-9,1",20101010'),
174+
engine="python",
175+
names=["col", "col1", "col2", "col3"],
176+
usecols=["col1", "col2", "col3"],
177+
parse_dates=["col3"],
178+
decimal=",",
179+
)
180+
expected = DataFrame(
181+
{"col1": [-9.1], "col2": [-9.1], "col3": [Timestamp("2010-10-10")]}
182+
)
183+
tm.assert_frame_equal(result, expected)

pandas/tests/io/parser/usecols/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)