Skip to content

REF: Unify _set_noconvert_dtype_columns for parsers #39365

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jan 27, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,7 @@ I/O
- Bug in :func:`to_hdf` raising ``KeyError`` when trying to apply for subclasses of ``DataFrame`` or ``Series`` (:issue:`33748`)
- Bug in :meth:`~HDFStore.put` raising a wrong ``TypeError`` when saving a DataFrame with non-string dtype (:issue:`34274`)
- Bug in :func:`json_normalize` resulting in the first element of a generator object not being included in the returned ``DataFrame`` (:issue:`35923`)
- Bug in :func:`read_csv` applying thousands separator to date columns when column should be parsed for dates and ``usecols`` is specified for ``engine="python"`` (:issue:`39365`)
- Bug in :func:`read_excel` forward filling :class:`MultiIndex` names with multiple header and index columns specified (:issue:`34673`)
- :func:`read_excel` now respects :func:`set_option` (:issue:`34252`)
- Bug in :func:`read_csv` not switching ``true_values`` and ``false_values`` for nullable ``boolean`` dtype (:issue:`34655`)
Expand Down
70 changes: 70 additions & 0 deletions pandas/io/parsers/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,8 @@ def __init__(self, kwds):

self._first_chunk = True

self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"])

self.handles: Optional[IOHandles] = None

def _open_handles(self, src: FilePathOrBuffer, kwds: Dict[str, Any]) -> None:
Expand Down Expand Up @@ -546,6 +548,74 @@ def _convert_to_ndarrays(
print(f"Filled {na_count} NA values in column {c!s}")
return result

def _set_noconvert_dtype_columns(
    self, col_indices: List[int], names: List[Union[int, str]]
) -> Set[int]:
    """
    Set the columns that should not undergo dtype conversions.

    Currently, any column that is involved with date parsing will not
    undergo such conversions. If usecols is specified, integer positions
    given in parse_dates (or index_col) are interpreted relative to the
    usecols selection, not relative to all columns.

    Parameters
    ----------
    col_indices: The indices specifying order and positions of the columns
    names: The column names, in the same order as col_indices

    Returns
    -------
    A set of integers containing the positions of the columns not to convert.
    """
    usecols: Optional[Union[List[int], List[str]]]
    if self.usecols_dtype == "integer":
        # A set of integers will be converted to a list in
        # the correct order every single time.
        usecols = sorted(self.usecols)
    elif callable(self.usecols) or self.usecols_dtype not in ("empty", None):
        # col_indices is passed in with the order and positions of the
        # columns, so an integer entry indexes straight into it.
        usecols = col_indices
    else:
        # Usecols is empty.
        usecols = None

    def _resolve(x) -> int:
        # Integer specifiers are positions within the usecols selection
        # (when there is one); translate them first.
        if usecols is not None and is_integer(x):
            x = usecols[x]

        # Anything that is still not an integer is a column label: map it
        # to the entry of col_indices at the label's position in names.
        if not is_integer(x):
            x = col_indices[names.index(x)]

        return x

    # Gather every column specifier into one flat list so the list and
    # dict forms of parse_dates share a single traversal.
    specs: List[Union[int, str]] = []
    if isinstance(self.parse_dates, (list, dict)):
        if isinstance(self.parse_dates, dict):
            groups = self.parse_dates.values()
        else:
            groups = self.parse_dates
        for val in groups:
            # An entry is either a single column or a list of columns.
            if isinstance(val, list):
                specs.extend(val)
            else:
                specs.append(val)
    elif self.parse_dates:
        # parse_dates is merely truthy: the index column(s) are the ones
        # excluded from conversion.
        if isinstance(self.index_col, list):
            specs.extend(self.index_col)
        elif self.index_col is not None:
            specs.append(self.index_col)

    return {_resolve(spec) for spec in specs}

def _infer_types(self, values, na_values, try_num_bool=True):
"""
Infer types of values, possibly casting
Expand Down
60 changes: 5 additions & 55 deletions pandas/io/parsers/c_parser_wrapper.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import pandas._libs.parsers as parsers
from pandas._typing import FilePathOrBuffer

from pandas.core.dtypes.common import is_integer

from pandas.core.indexes.api import ensure_index_from_sequences

from pandas.io.parsers.base_parser import ParserBase, is_index_col
Expand All @@ -19,7 +17,6 @@ def __init__(self, src: FilePathOrBuffer, **kwds):
kwds["allow_leading_cols"] = self.index_col is not False

# GH20529, validate usecol arg before TextReader
self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"])
kwds["usecols"] = self.usecols

# open handles
Expand Down Expand Up @@ -159,58 +156,11 @@ def _set_noconvert_columns(self):
Currently, any column that is involved with date parsing will not
undergo such conversions.
"""
names = self.orig_names
if self.usecols_dtype == "integer":
# A set of integers will be converted to a list in
# the correct order every single time.
usecols = list(self.usecols)
usecols.sort()
elif callable(self.usecols) or self.usecols_dtype not in ("empty", None):
# The names attribute should have the correct columns
# in the proper order for indexing with parse_dates.
usecols = self.names[:]
else:
# Usecols is empty.

# pandas\io\parsers.py:2030: error: Incompatible types in
# assignment (expression has type "None", variable has type
# "List[Any]") [assignment]
usecols = None # type: ignore[assignment]

def _set(x):
if usecols is not None and is_integer(x):
x = usecols[x]

if not is_integer(x):
# assert for mypy, names is List or None, None would error when calling
# .index()
assert names is not None
x = names.index(x)

self._reader.set_noconvert(x)

if isinstance(self.parse_dates, list):
for val in self.parse_dates:
if isinstance(val, list):
for k in val:
_set(k)
else:
_set(val)

elif isinstance(self.parse_dates, dict):
for val in self.parse_dates.values():
if isinstance(val, list):
for k in val:
_set(k)
else:
_set(val)

elif self.parse_dates:
if isinstance(self.index_col, list):
for k in self.index_col:
_set(k)
elif self.index_col is not None:
_set(self.index_col)
assert self.orig_names is not None
col_indices = [self.orig_names.index(x) for x in self.names]
noconvert_columns = self._set_noconvert_dtype_columns(col_indices, self.names)
for col in noconvert_columns:
self._reader.set_noconvert(col)

def set_error_bad_lines(self, status):
self._reader.set_error_bad_lines(int(status))
Expand Down
49 changes: 6 additions & 43 deletions pandas/io/parsers/python_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from io import StringIO
import re
import sys
from typing import Iterator, List, Optional, cast
from typing import Iterator, List, Optional, Set, cast

import numpy as np

Expand Down Expand Up @@ -53,7 +53,6 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds):
self.skipinitialspace = kwds["skipinitialspace"]
self.lineterminator = kwds["lineterminator"]
self.quoting = kwds["quoting"]
self.usecols, _ = self._validate_usecols_arg(kwds["usecols"])
self.skip_blank_lines = kwds["skip_blank_lines"]

self.warn_bad_lines = kwds["warn_bad_lines"]
Expand Down Expand Up @@ -136,10 +135,12 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds):
self._col_indices = list(range(len(self.columns)))

self._validate_parse_dates_presence(self.columns)
no_thousands_columns: Optional[Set[int]] = None
if self.parse_dates:
self._no_thousands_columns = self._set_no_thousands_columns()
else:
self._no_thousands_columns = None
no_thousands_columns = self._set_noconvert_dtype_columns(
self._col_indices, self.columns
)
self._no_thousands_columns = no_thousands_columns

if len(self.decimal) != 1:
raise ValueError("Only length-1 decimal markers supported")
Expand All @@ -155,44 +156,6 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds):
)
self.num = re.compile(regex)

def _set_no_thousands_columns(self):
# Build and return the set of column ids that must not have the
# thousands separator stripped. The ids come from parse_dates, or from
# index_col when parse_dates is only truthy (not a list or dict).
noconvert_columns = set()

def _set(x):
# Integer specifiers are recorded exactly as given; any other label is
# located in self.columns and mapped to its entry in self._col_indices.
if is_integer(x):
noconvert_columns.add(x)
else:
assert self._col_indices is not None
col_indices = self._col_indices
noconvert_columns.add(col_indices[self.columns.index(x)])

# parse_dates as a list: each entry is one column or a list of columns.
if isinstance(self.parse_dates, list):
for val in self.parse_dates:
if isinstance(val, list):
for k in val:
_set(k)
else:
_set(val)

# parse_dates as a dict: only the values (column or list of columns) are used.
elif isinstance(self.parse_dates, dict):
for val in self.parse_dates.values():
if isinstance(val, list):
for k in val:
_set(k)
else:
_set(val)

# Any other truthy parse_dates: use the index column(s) instead.
elif self.parse_dates:
if isinstance(self.index_col, list):
for k in self.index_col:
_set(k)
elif self.index_col is not None:
_set(self.index_col)

return noconvert_columns

def _make_reader(self, f):
sep = self.delimiter

Expand Down
18 changes: 18 additions & 0 deletions pandas/tests/io/parser/test_parse_dates.py
Original file line number Diff line number Diff line change
Expand Up @@ -1603,3 +1603,21 @@ def test_date_parser_and_names(all_parsers):
result = parser.read_csv(data, parse_dates=["B"], names=["B"])
expected = DataFrame({"B": ["y", "2"]}, index=["x", "1"])
tm.assert_frame_equal(result, expected)


def test_date_parser_usecols_thousands(all_parsers):
# GH#39365
data = """A,B,C
1,3,20-09-01-01
2,4,20-09-01-01
"""

parser = all_parsers
result = parser.read_csv(
StringIO(data),
parse_dates=[1],
usecols=[1, 2],
thousands="-",
)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is a bug yes? can you add a whatsnew note

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, added

expected = DataFrame({"B": [3, 4], "C": [Timestamp("20-09-2001 01:00:00")] * 2})
tm.assert_frame_equal(result, expected)