-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
REF: move union_categoricals call outside of cython #40964
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,22 @@ | ||
from __future__ import annotations | ||
|
||
import warnings | ||
|
||
import numpy as np | ||
|
||
import pandas._libs.parsers as parsers | ||
from pandas._typing import FilePathOrBuffer | ||
from pandas._typing import ( | ||
ArrayLike, | ||
FilePathOrBuffer, | ||
) | ||
from pandas.errors import DtypeWarning | ||
|
||
from pandas.core.dtypes.common import ( | ||
is_categorical_dtype, | ||
pandas_dtype, | ||
) | ||
from pandas.core.dtypes.concat import union_categoricals | ||
from pandas.core.dtypes.dtypes import ExtensionDtype | ||
|
||
from pandas.core.indexes.api import ensure_index_from_sequences | ||
|
||
|
@@ -10,12 +27,16 @@ | |
|
||
|
||
class CParserWrapper(ParserBase): | ||
low_memory: bool | ||
|
||
def __init__(self, src: FilePathOrBuffer, **kwds): | ||
self.kwds = kwds | ||
kwds = kwds.copy() | ||
|
||
ParserBase.__init__(self, kwds) | ||
|
||
self.low_memory = kwds.pop("low_memory", False) | ||
|
||
# #2442 | ||
kwds["allow_leading_cols"] = self.index_col is not False | ||
|
||
|
@@ -47,6 +68,7 @@ def __init__(self, src: FilePathOrBuffer, **kwds): | |
# TextIOBase, TextIOWrapper, mmap]" has no attribute "mmap" | ||
self.handles.handle = self.handles.handle.mmap # type: ignore[union-attr] | ||
|
||
kwds["dtype"] = ensure_dtype_objs(kwds.get("dtype", None)) | ||
try: | ||
self._reader = parsers.TextReader(self.handles.handle, **kwds) | ||
except Exception: | ||
|
@@ -160,7 +182,13 @@ def set_error_bad_lines(self, status): | |
|
||
def read(self, nrows=None): | ||
try: | ||
data = self._reader.read(nrows) | ||
if self.low_memory: | ||
chunks = self._reader.read_low_memory(nrows) | ||
# destructive to chunks | ||
data = _concatenate_chunks(chunks) | ||
|
||
else: | ||
data = self._reader.read(nrows) | ||
except StopIteration: | ||
if self._first_chunk: | ||
self._first_chunk = False | ||
|
@@ -265,7 +293,76 @@ def _get_index_names(self): | |
|
||
return names, idx_names | ||
|
||
def _maybe_parse_dates(self, values, index: int, try_parse_dates=True):
    """
    Run the configured date converter over ``values`` when the column
    at position ``index`` is marked for date parsing; otherwise return
    ``values`` unchanged.
    """
    if not (try_parse_dates and self._should_parse_dates(index)):
        return values
    return self._date_conv(values)
|
||
|
||
def _concatenate_chunks(chunks: list[dict[int, ArrayLike]]) -> dict: | ||
""" | ||
Concatenate chunks of data read with low_memory=True. | ||
|
||
The tricky part is handling Categoricals, where different chunks | ||
may have different inferred categories. | ||
""" | ||
names = list(chunks[0].keys()) | ||
warning_columns = [] | ||
|
||
result = {} | ||
for name in names: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. do we not have a generic pandas concat call to handle this? or is that for a followup There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
im hopeful we can use something more generic, but havent tried it out yet (comment on L336) |
||
arrs = [chunk.pop(name) for chunk in chunks] | ||
# Check each arr for consistent types. | ||
dtypes = {a.dtype for a in arrs} | ||
# TODO: shouldn't we exclude all EA dtypes here? | ||
numpy_dtypes = {x for x in dtypes if not is_categorical_dtype(x)} | ||
if len(numpy_dtypes) > 1: | ||
# error: Argument 1 to "find_common_type" has incompatible type | ||
# "Set[Any]"; expected "Sequence[Union[dtype[Any], None, type, | ||
# _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any, | ||
# Union[int, Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]]" | ||
common_type = np.find_common_type( | ||
numpy_dtypes, # type: ignore[arg-type] | ||
[], | ||
) | ||
if common_type == object: | ||
warning_columns.append(str(name)) | ||
|
||
dtype = dtypes.pop() | ||
if is_categorical_dtype(dtype): | ||
result[name] = union_categoricals(arrs, sort_categories=False) | ||
else: | ||
if isinstance(dtype, ExtensionDtype): | ||
# TODO: concat_compat? | ||
array_type = dtype.construct_array_type() | ||
# error: Argument 1 to "_concat_same_type" of "ExtensionArray" | ||
# has incompatible type "List[Union[ExtensionArray, ndarray]]"; | ||
# expected "Sequence[ExtensionArray]" | ||
result[name] = array_type._concat_same_type( | ||
arrs # type: ignore[arg-type] | ||
) | ||
else: | ||
result[name] = np.concatenate(arrs) | ||
|
||
if warning_columns: | ||
warning_names = ",".join(warning_columns) | ||
warning_message = " ".join( | ||
[ | ||
f"Columns ({warning_names}) have mixed types." | ||
f"Specify dtype option on import or set low_memory=False." | ||
] | ||
) | ||
warnings.warn(warning_message, DtypeWarning, stacklevel=8) | ||
return result | ||
|
||
|
||
def ensure_dtype_objs(dtype):
    """
    Ensure we have either None, a dtype object, or a dictionary mapping to
    dtype objects.
    """
    if dtype is None:
        return None
    if isinstance(dtype, dict):
        # Normalize every per-column specification to a dtype object.
        return {col: pandas_dtype(spec) for col, spec in dtype.items()}
    return pandas_dtype(dtype)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
typo