From 0e968fc70aaa5ef54aeb855dbada1658b11440c7 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 15 Mar 2023 02:10:10 +0100 Subject: [PATCH 1/2] BUG: read_csv for arrow with mismatching dtypes does not work --- pandas/io/parsers/c_parser_wrapper.py | 37 ++++--------------- .../io/parser/test_concatenate_chunks.py | 36 ++++++++++++++++++ 2 files changed, 43 insertions(+), 30 deletions(-) create mode 100644 pandas/tests/io/parser/test_concatenate_chunks.py diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 4b8bc5c402157..2b24e4e873a92 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -23,8 +23,10 @@ is_categorical_dtype, pandas_dtype, ) -from pandas.core.dtypes.concat import union_categoricals -from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.dtypes.concat import ( + concat_compat, + union_categoricals, +) from pandas.core.indexes.api import ensure_index_from_sequences @@ -379,40 +381,15 @@ def _concatenate_chunks(chunks: list[dict[int, ArrayLike]]) -> dict: arrs = [chunk.pop(name) for chunk in chunks] # Check each arr for consistent types. dtypes = {a.dtype for a in arrs} - # TODO: shouldn't we exclude all EA dtypes here? numpy_dtypes = {x for x in dtypes if not is_categorical_dtype(x)} - if len(numpy_dtypes) > 1: - # error: Argument 1 to "find_common_type" has incompatible type - # "Set[Any]"; expected "Sequence[Union[dtype[Any], None, type, - # _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any, - # Union[int, Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]]" - common_type = np.find_common_type( - numpy_dtypes, # type: ignore[arg-type] - [], - ) - if common_type == np.dtype(object): - warning_columns.append(str(name)) dtype = dtypes.pop() if is_categorical_dtype(dtype): result[name] = union_categoricals(arrs, sort_categories=False) - elif isinstance(dtype, ExtensionDtype): - # TODO: concat_compat? 
- array_type = dtype.construct_array_type() - # error: Argument 1 to "_concat_same_type" of "ExtensionArray" - # has incompatible type "List[Union[ExtensionArray, ndarray]]"; - # expected "Sequence[ExtensionArray]" - result[name] = array_type._concat_same_type(arrs) # type: ignore[arg-type] else: - # error: Argument 1 to "concatenate" has incompatible - # type "List[Union[ExtensionArray, ndarray[Any, Any]]]" - # ; expected "Union[_SupportsArray[dtype[Any]], - # Sequence[_SupportsArray[dtype[Any]]], - # Sequence[Sequence[_SupportsArray[dtype[Any]]]], - # Sequence[Sequence[Sequence[_SupportsArray[dtype[Any]]]]] - # , Sequence[Sequence[Sequence[Sequence[ - # _SupportsArray[dtype[Any]]]]]]]" - result[name] = np.concatenate(arrs) # type: ignore[arg-type] + result[name] = concat_compat(arrs) + if len(numpy_dtypes) > 1 and result[name].dtype == np.dtype(object): + warning_columns.append(str(name)) if warning_columns: warning_names = ",".join(warning_columns) diff --git a/pandas/tests/io/parser/test_concatenate_chunks.py b/pandas/tests/io/parser/test_concatenate_chunks.py new file mode 100644 index 0000000000000..1bae2317a2fc6 --- /dev/null +++ b/pandas/tests/io/parser/test_concatenate_chunks.py @@ -0,0 +1,36 @@ +import numpy as np +import pytest + +from pandas.errors import DtypeWarning + +import pandas._testing as tm +from pandas.core.arrays import ArrowExtensionArray + +from pandas.io.parsers.c_parser_wrapper import _concatenate_chunks + + +def test_concatenate_chunks_pyarrow(): + # GH#51876 + pa = pytest.importorskip("pyarrow") + chunks = [ + {0: ArrowExtensionArray(pa.array([1.5, 2.5]))}, + {0: ArrowExtensionArray(pa.array([1, 2]))}, + ] + result = _concatenate_chunks(chunks) + expected = ArrowExtensionArray(pa.array([1.5, 2.5, 1.0, 2.0])) + tm.assert_extension_array_equal(result[0], expected) + + +def test_concatenate_chunks_pyarrow_strings(): + # GH#51876 + pa = pytest.importorskip("pyarrow") + chunks = [ + {0: ArrowExtensionArray(pa.array([1.5, 2.5]))}, + {0: ArrowExtensionArray(pa.array(["a", "b"]))}, + ] + with tm.assert_produces_warning(DtypeWarning, match="have mixed types"): + result = _concatenate_chunks(chunks) + expected = np.concatenate( + [np.array([1.5, 2.5], dtype=object), np.array(["a", "b"])] + ) + tm.assert_numpy_array_equal(result[0], expected) From 559bda4f4265731c589a454ee2d17e07e4b27c3b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 15 Mar 2023 16:57:05 +0100 Subject: [PATCH 2/2] Rename var --- pandas/io/parsers/c_parser_wrapper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 2b24e4e873a92..3d3e343050421 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -381,14 +381,14 @@ def _concatenate_chunks(chunks: list[dict[int, ArrayLike]]) -> dict: arrs = [chunk.pop(name) for chunk in chunks] # Check each arr for consistent types. dtypes = {a.dtype for a in arrs} - numpy_dtypes = {x for x in dtypes if not is_categorical_dtype(x)} + non_cat_dtypes = {x for x in dtypes if not is_categorical_dtype(x)} dtype = dtypes.pop() if is_categorical_dtype(dtype): result[name] = union_categoricals(arrs, sort_categories=False) else: result[name] = concat_compat(arrs) - if len(numpy_dtypes) > 1 and result[name].dtype == np.dtype(object): + if len(non_cat_dtypes) > 1 and result[name].dtype == np.dtype(object): warning_columns.append(str(name)) if warning_columns:
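
A minimal sketch of the behavior this series exercises, mirroring the new test in test_concatenate_chunks.py. It assumes pandas with pyarrow installed and calls the private helper _concatenate_chunks directly, so it is illustrative rather than a public API; the expected float64[pyarrow] result comes from the test's expectation, not from additional guarantees.

    import pyarrow as pa

    from pandas.core.arrays import ArrowExtensionArray
    from pandas.io.parsers.c_parser_wrapper import _concatenate_chunks

    # Two chunks of the same column (key 0) with mismatching Arrow dtypes:
    # float64 in the first chunk, int64 in the second.
    chunks = [
        {0: ArrowExtensionArray(pa.array([1.5, 2.5]))},
        {0: ArrowExtensionArray(pa.array([1, 2]))},
    ]

    # The removed ExtensionDtype branch popped a single dtype and used
    # ExtensionArray._concat_same_type, which assumes all chunks share one
    # dtype; routing through concat_compat instead lets mismatching dtypes
    # be reconciled, here to a single float64[pyarrow] column
    # [1.5, 2.5, 1.0, 2.0].
    result = _concatenate_chunks(chunks)
    print(result[0].dtype)

When the chunk dtypes cannot be reconciled (e.g. floats in one chunk, strings in the next, as in test_concatenate_chunks_pyarrow_strings), concat_compat falls back to an object-dtype result and the existing DtypeWarning path now reports the column, since the warning check moved after concatenation and keys off the resulting dtype.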