Skip to content

REF: move union_categoricals call outside of cython #40964

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Apr 28, 2021
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pandas/_libs/parsers.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@ class TextReader:
true_values=...,
false_values=...,
allow_leading_cols: bool = ...,
low_memory: bool = ...,
skiprows=...,
skipfooter: int = ..., # int64_t
verbose: bool = ...,
Expand All @@ -75,3 +74,4 @@ class TextReader:
def close(self) -> None: ...

def read(self, rows: int | None = ...) -> dict[int, ArrayLike]: ...
def read_low_memory(self, rows: int | None) -> list[dict[int, ArrayLike]]: ...
80 changes: 15 additions & 65 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,6 @@ from pandas._libs.khash cimport (
)

from pandas.errors import (
DtypeWarning,
EmptyDataError,
ParserError,
ParserWarning,
Expand All @@ -108,9 +107,7 @@ from pandas.core.dtypes.common import (
is_float_dtype,
is_integer_dtype,
is_object_dtype,
pandas_dtype,
)
from pandas.core.dtypes.concat import union_categoricals

cdef:
float64_t INF = <float64_t>np.inf
Expand Down Expand Up @@ -317,7 +314,7 @@ cdef class TextReader:

cdef public:
int64_t leading_cols, table_width, skipfooter, buffer_lines
bint allow_leading_cols, mangle_dupe_cols, low_memory
bint allow_leading_cols, mangle_dupe_cols
bint delim_whitespace
object delimiter # bytes or str
object converters
Expand Down Expand Up @@ -362,7 +359,6 @@ cdef class TextReader:
true_values=None,
false_values=None,
bint allow_leading_cols=True,
bint low_memory=False,
skiprows=None,
skipfooter=0, # int64_t
bint verbose=False,
Expand Down Expand Up @@ -479,7 +475,6 @@ cdef class TextReader:
self.na_filter = na_filter

self.verbose = verbose
self.low_memory = low_memory

if float_precision == "round_trip":
# see gh-15140
Expand All @@ -492,12 +487,10 @@ cdef class TextReader:
raise ValueError(f'Unrecognized float_precision option: '
f'{float_precision}')

if isinstance(dtype, dict):
dtype = {k: pandas_dtype(dtype[k])
for k in dtype}
elif dtype is not None:
dtype = pandas_dtype(dtype)

# Caller is responsible for ensuring we have one of
# - None
# - DtypeObj
# - dict[Any, DtypeObj]
self.dtype = dtype

# XXX
Expand Down Expand Up @@ -773,17 +766,18 @@ cdef class TextReader:
"""
rows=None --> read all rows
"""
if self.low_memory:
# Conserve intermediate space
columns = self._read_low_memory(rows)
else:
# Don't care about memory usage
columns = self._read_rows(rows, 1)
# Don't care about memory usage
columns = self._read_rows(rows, 1)

return columns

# -> dict[int, "ArrayLike"]
cdef _read_low_memory(self, rows):
def read_low_memory(self, rows: int | None) -> list[dict[int, "ArrayLike"]]:
"""
rows=None --> read all rows
"""
# Conserve intermediate space
# Caller is responsible for concatenating chunks,
# see c_parser_wrapper._concatenate_chunks
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

typo

cdef:
size_t rows_read = 0
list chunks = []
Expand Down Expand Up @@ -818,8 +812,7 @@ cdef class TextReader:
if len(chunks) == 0:
raise StopIteration

# destructive to chunks
return _concatenate_chunks(chunks)
return chunks

cdef _tokenize_rows(self, size_t nrows):
cdef:
Expand Down Expand Up @@ -1907,49 +1900,6 @@ cdef raise_parser_error(object base, parser_t *parser):
raise ParserError(message)


# chunks: list[dict[int, "ArrayLike"]]
# -> dict[int, "ArrayLike"]
def _concatenate_chunks(list chunks) -> dict:
    """
    Concatenate chunks of data read with low_memory=True.

    Destructive to ``chunks``: each column array is popped from every
    chunk dict as it is consumed.

    Parameters
    ----------
    chunks : list of dict[int, ArrayLike]
        One dict mapping column index -> array per chunk.

    Returns
    -------
    dict
        Mapping of column index to the concatenated column.
    """
    cdef:
        list names = list(chunks[0].keys())
        object name
        list warning_columns = []
        object warning_names
        object common_type

    result = {}
    for name in names:
        arrs = [chunk.pop(name) for chunk in chunks]
        # Check each arr for consistent types.
        dtypes = {a.dtype for a in arrs}
        numpy_dtypes = {x for x in dtypes if not is_categorical_dtype(x)}
        if len(numpy_dtypes) > 1:
            common_type = np.find_common_type(numpy_dtypes, [])
            if common_type == object:
                # Chunks disagreed and fell back to object dtype; warn below.
                warning_columns.append(str(name))

        dtype = dtypes.pop()
        if is_categorical_dtype(dtype):
            # BUG FIX: the old check ``isinstance(dtype, str)`` was always
            # False (``dtype`` is a dtype object popped from a set of
            # ``.dtype`` attributes, never a str), so categories were never
            # sorted.  Make the effective behavior explicit.
            result[name] = union_categoricals(arrs, sort_categories=False)
        else:
            if is_extension_array_dtype(dtype):
                array_type = dtype.construct_array_type()
                result[name] = array_type._concat_same_type(arrs)
            else:
                result[name] = np.concatenate(arrs)

    if warning_columns:
        warning_names = ','.join(warning_columns)
        # BUG FIX: previously two f-strings were implicitly concatenated with
        # no separating space ("...mixed types.Specify...") and then wrapped
        # in a no-op single-element ``" ".join``.  Emit one message with a
        # proper space between the sentences.
        warning_message = (
            f"Columns ({warning_names}) have mixed types. "
            f"Specify dtype option on import or set low_memory=False."
        )
        warnings.warn(warning_message, DtypeWarning, stacklevel=8)
    return result


# ----------------------------------------------------------------------
# NA values
def _compute_na_values():
Expand Down
103 changes: 100 additions & 3 deletions pandas/io/parsers/c_parser_wrapper.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,22 @@
from __future__ import annotations

import warnings

import numpy as np

import pandas._libs.parsers as parsers
from pandas._typing import FilePathOrBuffer
from pandas._typing import (
ArrayLike,
FilePathOrBuffer,
)
from pandas.errors import DtypeWarning

from pandas.core.dtypes.common import (
is_categorical_dtype,
pandas_dtype,
)
from pandas.core.dtypes.concat import union_categoricals
from pandas.core.dtypes.dtypes import ExtensionDtype

from pandas.core.indexes.api import ensure_index_from_sequences

Expand All @@ -10,12 +27,16 @@


class CParserWrapper(ParserBase):
low_memory: bool

def __init__(self, src: FilePathOrBuffer, **kwds):
self.kwds = kwds
kwds = kwds.copy()

ParserBase.__init__(self, kwds)

self.low_memory = kwds.pop("low_memory", False)

# #2442
kwds["allow_leading_cols"] = self.index_col is not False

Expand Down Expand Up @@ -47,6 +68,7 @@ def __init__(self, src: FilePathOrBuffer, **kwds):
# TextIOBase, TextIOWrapper, mmap]" has no attribute "mmap"
self.handles.handle = self.handles.handle.mmap # type: ignore[union-attr]

kwds["dtype"] = ensure_dtype_objs(kwds.get("dtype", None))
try:
self._reader = parsers.TextReader(self.handles.handle, **kwds)
except Exception:
Expand Down Expand Up @@ -160,7 +182,13 @@ def set_error_bad_lines(self, status):

def read(self, nrows=None):
try:
data = self._reader.read(nrows)
if self.low_memory:
chunks = self._reader.read_low_memory(nrows)
# destructive to chunks
data = _concatenate_chunks(chunks)

else:
data = self._reader.read(nrows)
except StopIteration:
if self._first_chunk:
self._first_chunk = False
Expand Down Expand Up @@ -265,7 +293,76 @@ def _get_index_names(self):

return names, idx_names

def _maybe_parse_dates(self, values, index, try_parse_dates=True):
def _maybe_parse_dates(self, values, index: int, try_parse_dates=True):
if try_parse_dates and self._should_parse_dates(index):
values = self._date_conv(values)
return values


def _concatenate_chunks(chunks: list[dict[int, ArrayLike]]) -> dict:
"""
Concatenate chunks of data read with low_memory=True.

The tricky part is handling Categoricals, where different chunks
may have different inferred categories.
"""
names = list(chunks[0].keys())
warning_columns = []

result = {}
for name in names:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we not have a generic pandas concat call to handle this? or is that for a followup

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we not have a generic pandas concat call to handle this? or is that for a followup

im hopeful we can use something more generic, but havent tried it out yet (comment on L336)

arrs = [chunk.pop(name) for chunk in chunks]
# Check each arr for consistent types.
dtypes = {a.dtype for a in arrs}
# TODO: shouldn't we exclude all EA dtypes here?
numpy_dtypes = {x for x in dtypes if not is_categorical_dtype(x)}
if len(numpy_dtypes) > 1:
# error: Argument 1 to "find_common_type" has incompatible type
# "Set[Any]"; expected "Sequence[Union[dtype[Any], None, type,
# _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any,
# Union[int, Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]]"
common_type = np.find_common_type(
numpy_dtypes, # type: ignore[arg-type]
[],
)
if common_type == object:
warning_columns.append(str(name))

dtype = dtypes.pop()
if is_categorical_dtype(dtype):
result[name] = union_categoricals(arrs, sort_categories=False)
else:
if isinstance(dtype, ExtensionDtype):
# TODO: concat_compat?
array_type = dtype.construct_array_type()
# error: Argument 1 to "_concat_same_type" of "ExtensionArray"
# has incompatible type "List[Union[ExtensionArray, ndarray]]";
# expected "Sequence[ExtensionArray]"
result[name] = array_type._concat_same_type(
arrs # type: ignore[arg-type]
)
else:
result[name] = np.concatenate(arrs)

if warning_columns:
warning_names = ",".join(warning_columns)
warning_message = " ".join(
[
f"Columns ({warning_names}) have mixed types."
f"Specify dtype option on import or set low_memory=False."
]
)
warnings.warn(warning_message, DtypeWarning, stacklevel=8)
return result


def ensure_dtype_objs(dtype):
    """
    Normalize a user-supplied dtype argument.

    Returns None unchanged, converts a dict to a dict of pandas dtype
    objects, and converts anything else to a single pandas dtype object.
    """
    if dtype is None:
        return None
    if isinstance(dtype, dict):
        return {key: pandas_dtype(val) for key, val in dtype.items()}
    return pandas_dtype(dtype)
5 changes: 5 additions & 0 deletions pandas/tests/io/parser/test_textreader.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
TextFileReader,
read_csv,
)
from pandas.io.parsers.c_parser_wrapper import ensure_dtype_objs


class TestTextReader:
Expand Down Expand Up @@ -206,6 +207,8 @@ def test_numpy_string_dtype(self):
aaaaa,5"""

def _make_reader(**kwds):
if "dtype" in kwds:
kwds["dtype"] = ensure_dtype_objs(kwds["dtype"])
return TextReader(StringIO(data), delimiter=",", header=None, **kwds)

reader = _make_reader(dtype="S5,i4")
Expand Down Expand Up @@ -233,6 +236,8 @@ def test_pass_dtype(self):
4,d"""

def _make_reader(**kwds):
if "dtype" in kwds:
kwds["dtype"] = ensure_dtype_objs(kwds["dtype"])
return TextReader(StringIO(data), delimiter=",", **kwds)

reader = _make_reader(dtype={"one": "u1", 1: "S1"})
Expand Down