From 38e37be3baa396199e210f52e5066f487db554ea Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Wed, 31 Jul 2019 15:09:52 +0200 Subject: [PATCH 1/3] Handle construction of string ExtensionArray from lists --- pandas/core/construction.py | 4 +- .../extension/arrow/{bool.py => arrays.py} | 53 +++++++++++++++---- pandas/tests/extension/arrow/test_bool.py | 2 +- pandas/tests/extension/arrow/test_string.py | 13 +++++ 4 files changed, 61 insertions(+), 11 deletions(-) rename pandas/tests/extension/arrow/{bool.py => arrays.py} (80%) create mode 100644 pandas/tests/extension/arrow/test_string.py diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 9528723a6dc0f..b31f466569875 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -470,7 +470,9 @@ def sanitize_array(data, index, dtype=None, copy=False, raise_cast_failure=False # This is to prevent mixed-type Series getting all casted to # NumPy string type, e.g. NaN --> '-1#IND'. - if issubclass(subarr.dtype.type, str): + if not ( + is_extension_array_dtype(subarr.dtype) or is_extension_array_dtype(dtype) + ) and issubclass(subarr.dtype.type, str): # GH#16605 # If not empty convert the data to dtype # GH#19853: If data is a scalar, subarr has already the result diff --git a/pandas/tests/extension/arrow/bool.py b/pandas/tests/extension/arrow/arrays.py similarity index 80% rename from pandas/tests/extension/arrow/bool.py rename to pandas/tests/extension/arrow/arrays.py index eb75d6d968073..3b6b0455f9a5d 100644 --- a/pandas/tests/extension/arrow/bool.py +++ b/pandas/tests/extension/arrow/arrays.py @@ -43,18 +43,27 @@ def _is_boolean(self): return True -class ArrowBoolArray(ExtensionArray): - def __init__(self, values): - if not isinstance(values, pa.ChunkedArray): - raise ValueError +@register_extension_dtype +class ArrowStringDtype(ExtensionDtype): - assert values.type == pa.bool_() - self._data = values - self._dtype = ArrowBoolDtype() + type = str + kind = "U" + name = "arrow_string" + na_value = pa.NULL - def __repr__(self): - return "ArrowBoolArray({})".format(repr(self._data)) + @classmethod + def construct_from_string(cls, string): + if string == cls.name: + return cls() + else: + raise TypeError("Cannot construct a '{}' from '{}'".format(cls, string)) + @classmethod + def construct_array_type(cls): + return ArrowStringArray + + +class ArrowExtensionArray(ExtensionArray): @classmethod def from_scalars(cls, values): arr = pa.chunked_array([pa.array(np.asarray(values))]) @@ -142,3 +151,29 @@ def any(self, axis=0, out=None): def all(self, axis=0, out=None): return self._data.to_pandas().all() + + +class ArrowBoolArray(ArrowExtensionArray): + def __init__(self, values): + if not isinstance(values, pa.ChunkedArray): + raise ValueError + + assert values.type == pa.bool_() + self._data = values + self._dtype = ArrowBoolDtype() + + def __repr__(self): + return "ArrowBoolArray({})".format(repr(self._data)) + + +class ArrowStringArray(ArrowExtensionArray): + def __init__(self, values): + if not isinstance(values, pa.ChunkedArray): + raise ValueError + + assert values.type == pa.string() + self._data = values + self._dtype = ArrowStringDtype() + + def __repr__(self): + return "ArrowStringArray({})".format(repr(self._data)) diff --git a/pandas/tests/extension/arrow/test_bool.py b/pandas/tests/extension/arrow/test_bool.py index 205edf5da5b74..cc0deca765b41 100644 --- a/pandas/tests/extension/arrow/test_bool.py +++ b/pandas/tests/extension/arrow/test_bool.py @@ -7,7 +7,7 @@ pytest.importorskip("pyarrow", minversion="0.10.0") -from .bool import ArrowBoolArray, ArrowBoolDtype # isort:skip +from .arrays import ArrowBoolArray, ArrowBoolDtype # isort:skip @pytest.fixture diff --git a/pandas/tests/extension/arrow/test_string.py b/pandas/tests/extension/arrow/test_string.py new file mode 100644 index 0000000000000..06f149aa4b75f --- /dev/null +++ b/pandas/tests/extension/arrow/test_string.py @@ -0,0 +1,13 @@ +import pytest + +import pandas as pd + +pytest.importorskip("pyarrow", minversion="0.10.0") + +from .arrays import ArrowStringDtype # isort:skip + + +def test_constructor_from_list(): + # GH 27673 + result = pd.Series(["E"], dtype=ArrowStringDtype()) + assert isinstance(result.dtype, ArrowStringDtype) From 503335d93eff7cdaa053fafa0539586b95d3bf1b Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Thu, 1 Aug 2019 09:56:28 +0200 Subject: [PATCH 2/3] Pull ExtensionArray check one level up --- pandas/core/construction.py | 45 +++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index b31f466569875..0c25cdf121cbb 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -468,32 +468,27 @@ def sanitize_array(data, index, dtype=None, copy=False, raise_cast_failure=False else: subarr = com.asarray_tuplesafe(data, dtype=dtype) - # This is to prevent mixed-type Series getting all casted to - # NumPy string type, e.g. NaN --> '-1#IND'. - if not ( - is_extension_array_dtype(subarr.dtype) or is_extension_array_dtype(dtype) - ) and issubclass(subarr.dtype.type, str): - # GH#16605 - # If not empty convert the data to dtype - # GH#19853: If data is a scalar, subarr has already the result - if not lib.is_scalar(data): - if not np.all(isna(data)): - data = np.array(data, dtype=dtype, copy=False) - subarr = np.array(data, dtype=object, copy=copy) - - if ( - not (is_extension_array_dtype(subarr.dtype) or is_extension_array_dtype(dtype)) - and is_object_dtype(subarr.dtype) - and not is_object_dtype(dtype) - ): - inferred = lib.infer_dtype(subarr, skipna=False) - if inferred == "period": - from pandas.core.arrays import period_array + if not (is_extension_array_dtype(subarr.dtype) or is_extension_array_dtype(dtype)): + # This is to prevent mixed-type Series getting all casted to + # NumPy string type, e.g. NaN --> '-1#IND'. + if issubclass(subarr.dtype.type, str): + # GH#16605 + # If not empty convert the data to dtype + # GH#19853: If data is a scalar, subarr has already the result + if not lib.is_scalar(data): + if not np.all(isna(data)): + data = np.array(data, dtype=dtype, copy=False) + subarr = np.array(data, dtype=object, copy=copy) - try: - subarr = period_array(subarr) - except IncompatibleFrequency: - pass + if is_object_dtype(subarr.dtype) and not is_object_dtype(dtype): + inferred = lib.infer_dtype(subarr, skipna=False) + if inferred == "period": + from pandas.core.arrays import period_array + + try: + subarr = period_array(subarr) + except IncompatibleFrequency: + pass return subarr From 3f30c155e15ed2754acf6d3fe693d395ca1e2996 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Thu, 1 Aug 2019 18:28:07 +0200 Subject: [PATCH 3/3] Unify repr --- pandas/tests/extension/arrow/arrays.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pandas/tests/extension/arrow/arrays.py b/pandas/tests/extension/arrow/arrays.py index 3b6b0455f9a5d..6a28f76e474cc 100644 --- a/pandas/tests/extension/arrow/arrays.py +++ b/pandas/tests/extension/arrow/arrays.py @@ -78,6 +78,9 @@ def from_array(cls, arr): def _from_sequence(cls, scalars, dtype=None, copy=False): return cls.from_scalars(scalars) + def __repr__(self): + return "{cls}({data})".format(cls=type(self).__name__, data=repr(self._data)) + def __getitem__(self, item): if pd.api.types.is_scalar(item): return self._data.to_pandas()[item] @@ -162,9 +165,6 @@ def __init__(self, values): self._data = values self._dtype = ArrowBoolDtype() - def __repr__(self): - return "ArrowBoolArray({})".format(repr(self._data)) - class ArrowStringArray(ArrowExtensionArray): def __init__(self, values): @@ -174,6 +174,3 @@ def __init__(self, values): assert values.type == pa.string() self._data = values self._dtype = ArrowStringDtype() - - def __repr__(self): - return "ArrowStringArray({})".format(repr(self._data))