-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
REF: Unify _set_noconvert_dtype_columns for parsers #39365
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -172,6 +172,8 @@ def __init__(self, kwds): | |
|
||
self._first_chunk = True | ||
|
||
self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"]) | ||
|
||
self.handles: Optional[IOHandles] = None | ||
|
||
def _open_handles(self, src: FilePathOrBuffer, kwds: Dict[str, Any]) -> None: | ||
|
@@ -546,6 +548,65 @@ def _convert_to_ndarrays( | |
print(f"Filled {na_count} NA values in column {c!s}") | ||
return result | ||
|
||
def _set_noconvert_dtype_columns(self, col_indices, names):
    """
    Set the columns that should not undergo dtype conversions.

    Currently, any column that is involved with date parsing will not
    undergo such conversions. When ``usecols`` is in effect, integer
    selectors are positions within the used columns rather than within
    all columns of the file.

    Parameters
    ----------
    col_indices : sequence
        Positions of the columns being read, aligned with ``names``
        (``col_indices[i]`` belongs to ``names[i]``).
    names : list
        Column labels, used to translate label selectors into positions.

    Returns
    -------
    set
        The column positions that must be excluded from dtype conversion.

    Raises
    ------
    ValueError
        If a label selector is not present in ``names``.
    IndexError
        If an integer selector is out of range for the used columns.
    """
    # Declared up front so the "no usecols" branch can assign None without
    # needing a ``type: ignore``.
    usecols: Optional[list]
    if self.usecols_dtype == "integer":
        # Integer usecols are homogeneous, so a plain sorted() is enough
        # (no safe-sort for mixed types is required) and yields the
        # columns in file order every time.
        usecols = sorted(self.usecols)
    elif callable(self.usecols) or self.usecols_dtype not in ("empty", None):
        # ``col_indices`` already holds the selected columns in the
        # proper order for indexing with parse_dates.
        usecols = col_indices
    else:
        # Usecols is empty.
        usecols = None

    def _position(col):
        # Integer selectors are relative to the used columns.
        if usecols is not None and is_integer(col):
            col = usecols[col]
        # Anything that is (still) a label is mapped through ``names``.
        if not is_integer(col):
            col = col_indices[names.index(col)]
        return col

    parse_dates = self.parse_dates
    selectors = []
    if isinstance(parse_dates, (list, dict)):
        if isinstance(parse_dates, dict):
            specs = parse_dates.values()
        else:
            specs = parse_dates
        # Each spec is either a single selector or a list of selectors
        # that are combined into one date column; flatten one level.
        for spec in specs:
            if isinstance(spec, list):
                selectors.extend(spec)
            else:
                selectors.append(spec)
    elif parse_dates:
        # A truthy non-list, non-dict parse_dates means "parse the index".
        index_col = self.index_col
        if isinstance(index_col, list):
            selectors.extend(index_col)
        elif index_col is not None:
            selectors.append(index_col)

    return {_position(col) for col in selectors}
|
||
def _infer_types(self, values, na_values, try_num_bool=True): | ||
""" | ||
Infer types of values, possibly casting | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1603,3 +1603,21 @@ def test_date_parser_and_names(all_parsers): | |
result = parser.read_csv(data, parse_dates=["B"], names=["B"]) | ||
expected = DataFrame({"B": ["y", "2"]}, index=["x", "1"]) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
|
||
def test_date_parser_usecols_thousands(all_parsers):
    """
    Check date parsing by position together with ``usecols`` and ``thousands``.

    ``parse_dates=[1]`` combined with ``usecols=[1, 2]`` selects column "C";
    it is expected to come back as timestamps while "B" stays integer, even
    though the thousands separator also occurs in the date strings.
    """
    # GH#39365
    csv_text = """A,B,C
1,3,20-09-01-01
2,4,20-09-01-01
"""
    read_options = {"parse_dates": [1], "usecols": [1, 2], "thousands": "-"}

    result = all_parsers.read_csv(StringIO(csv_text), **read_options)

    parsed_date = Timestamp("20-09-2001 01:00:00")
    expected = DataFrame({"B": [3, 4], "C": [parsed_date] * 2})
    tm.assert_frame_equal(result, expected)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you type this in any way? (esp the return value) and add a doc-string
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done