From b453cfd3cb49717fde06cc9b1b87f78f51195dab Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 11 Dec 2020 14:58:42 +0100 Subject: [PATCH 1/3] Move __from_arrow__ to base class --- pandas/core/arrays/floating.py | 37 +++------------------------------ pandas/core/arrays/integer.py | 37 +++------------------------------ pandas/core/arrays/numeric.py | 38 +++++++++++++++++++++++++++++++++- 3 files changed, 43 insertions(+), 69 deletions(-) diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index 1077538f6a21d..aedf6adf09db4 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -1,5 +1,5 @@ import numbers -from typing import TYPE_CHECKING, List, Optional, Tuple, Type, Union +from typing import List, Optional, Tuple, Type import warnings import numpy as np @@ -27,13 +27,10 @@ from pandas.core.tools.numeric import to_numeric from .masked import BaseMaskedDtype -from .numeric import NumericArray +from .numeric import NumericArray, NumericDtype -if TYPE_CHECKING: - import pyarrow - -class FloatingDtype(BaseMaskedDtype): +class FloatingDtype(NumericDtype): """ An ExtensionDtype to hold a single size of floating dtype. @@ -72,34 +69,6 @@ def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: return FLOAT_STR_TO_DTYPE[str(np_dtype)] return None - def __from_arrow__( - self, array: Union["pyarrow.Array", "pyarrow.ChunkedArray"] - ) -> "FloatingArray": - """ - Construct FloatingArray from pyarrow Array/ChunkedArray. 
- """ - import pyarrow - - from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask - - pyarrow_type = pyarrow.from_numpy_dtype(self.type) - if not array.type.equals(pyarrow_type): - array = array.cast(pyarrow_type) - - if isinstance(array, pyarrow.Array): - chunks = [array] - else: - # pyarrow.ChunkedArray - chunks = array.chunks - - results = [] - for arr in chunks: - data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype=self.type) - float_arr = FloatingArray(data.copy(), ~mask, copy=False) - results.append(float_arr) - - return FloatingArray._concat_same_type(results) - def coerce_to_array( values, dtype=None, mask=None, copy: bool = False diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index fa427e94fe08f..01bae9e5086a8 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -1,5 +1,5 @@ import numbers -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type, Union +from typing import Dict, List, Optional, Tuple, Type import warnings import numpy as np @@ -27,13 +27,10 @@ from pandas.core.tools.numeric import to_numeric from .masked import BaseMaskedArray, BaseMaskedDtype -from .numeric import NumericArray +from .numeric import NumericArray, NumericDtype -if TYPE_CHECKING: - import pyarrow - -class _IntegerDtype(BaseMaskedDtype): +class _IntegerDtype(NumericDtype): """ An ExtensionDtype to hold a single size & kind of integer dtype. @@ -92,34 +89,6 @@ def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: return FLOAT_STR_TO_DTYPE[str(np_dtype)] return None - def __from_arrow__( - self, array: Union["pyarrow.Array", "pyarrow.ChunkedArray"] - ) -> "IntegerArray": - """ - Construct IntegerArray from pyarrow Array/ChunkedArray. 
- """ - import pyarrow - - from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask - - pyarrow_type = pyarrow.from_numpy_dtype(self.type) - if not array.type.equals(pyarrow_type): - array = array.cast(pyarrow_type) - - if isinstance(array, pyarrow.Array): - chunks = [array] - else: - # pyarrow.ChunkedArray - chunks = array.chunks - - results = [] - for arr in chunks: - data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype=self.type) - int_arr = IntegerArray(data.copy(), ~mask, copy=False) - results.append(int_arr) - - return IntegerArray._concat_same_type(results) - def integer_array(values, dtype=None, copy: bool = False) -> "IntegerArray": """ diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index 5447a84c86ac1..46243dfbe0272 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -1,4 +1,5 @@ import datetime +from typing import TYPE_CHECKING, Union import numpy as np @@ -13,7 +14,42 @@ is_list_like, ) -from .masked import BaseMaskedArray +from .masked import BaseMaskedArray, BaseMaskedDtype + +if TYPE_CHECKING: + import pyarrow + + +class NumericDtype(BaseMaskedDtype): + def __from_arrow__( + self, array: Union["pyarrow.Array", "pyarrow.ChunkedArray"] + ) -> "NumericArray": + """ + Construct IntegerArray/FloatingArray from pyarrow Array/ChunkedArray. 
+ """ + import pyarrow + + from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask + + array_class = self.construct_array_type() + + pyarrow_type = pyarrow.from_numpy_dtype(self.type) + if not array.type.equals(pyarrow_type): + array = array.cast(pyarrow_type) + + if isinstance(array, pyarrow.Array): + chunks = [array] + else: + # pyarrow.ChunkedArray + chunks = array.chunks + + results = [] + for arr in chunks: + data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype=self.type) + num_arr = array_class(data.copy(), ~mask, copy=False) + results.append(num_arr) + + return array_class._concat_same_type(results) class NumericArray(BaseMaskedArray): From 6ce6e18911a57d1f3a16205cf07e81bffdd3fa03 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 11 Dec 2020 15:08:41 +0100 Subject: [PATCH 2/3] avoid additional copy for non-chunked array --- pandas/core/arrays/numeric.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index 46243dfbe0272..06ad543e0d86d 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -49,7 +49,11 @@ def __from_arrow__( num_arr = array_class(data.copy(), ~mask, copy=False) results.append(num_arr) - return array_class._concat_same_type(results) + if len(results) == 1: + # avoid additional copy in _concat_same_type + return results[0] + else: + return array_class._concat_same_type(results) class NumericArray(BaseMaskedArray): From a01f8aaaa803de8ba1c5d4dfb541297112cd194a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 11 Dec 2020 16:04:57 +0100 Subject: [PATCH 3/3] typing --- pandas/core/arrays/numeric.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index 06ad543e0d86d..3c115ec42f6ec 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -23,7 +23,7 @@ class 
NumericDtype(BaseMaskedDtype): def __from_arrow__( self, array: Union["pyarrow.Array", "pyarrow.ChunkedArray"] - ) -> "NumericArray": + ) -> BaseMaskedArray: """ Construct IntegerArray/FloatingArray from pyarrow Array/ChunkedArray. """