From 49b096f46c1b5a9b2cbf17058011e73e4870cd0a Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 29 Nov 2021 18:05:41 -0800 Subject: [PATCH 1/5] REF: hold PeriodArray in NDArrayBackedExtensionBlock --- pandas/core/dtypes/common.py | 7 +++++-- pandas/core/internals/api.py | 4 +++- pandas/core/internals/blocks.py | 11 +++++++++++ pandas/core/internals/construction.py | 2 +- pandas/tests/arithmetic/test_period.py | 3 +++ pandas/tests/extension/base/reshaping.py | 4 ++-- pandas/tests/internals/test_internals.py | 2 +- 7 files changed, 26 insertions(+), 7 deletions(-) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 7ac8e6c47158c..a8ce5f68a682f 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1399,11 +1399,12 @@ def is_1d_only_ea_obj(obj: Any) -> bool: from pandas.core.arrays import ( DatetimeArray, ExtensionArray, + PeriodArray, TimedeltaArray, ) return isinstance(obj, ExtensionArray) and not isinstance( - obj, (DatetimeArray, TimedeltaArray) + obj, (DatetimeArray, TimedeltaArray, PeriodArray) ) @@ -1415,7 +1416,9 @@ def is_1d_only_ea_dtype(dtype: DtypeObj | None) -> bool: # here too. # NB: need to check DatetimeTZDtype and not is_datetime64tz_dtype # to exclude ArrowTimestampUSDtype - return isinstance(dtype, ExtensionDtype) and not isinstance(dtype, DatetimeTZDtype) + return isinstance(dtype, ExtensionDtype) and not isinstance( + dtype, (DatetimeTZDtype, PeriodDtype) + ) def is_extension_array_dtype(arr_or_dtype) -> bool: diff --git a/pandas/core/internals/api.py b/pandas/core/internals/api.py index fe0b36a8ef4d1..537ae8f2a4320 100644 --- a/pandas/core/internals/api.py +++ b/pandas/core/internals/api.py @@ -15,6 +15,7 @@ from pandas.core.dtypes.common import ( is_datetime64tz_dtype, + is_period_dtype, pandas_dtype, ) @@ -62,8 +63,9 @@ def make_block( placement = BlockPlacement(placement) ndim = maybe_infer_ndim(values, placement, ndim) - if is_datetime64tz_dtype(values.dtype): + if is_datetime64tz_dtype(values.dtype) or is_period_dtype(values.dtype): # GH#41168 ensure we can pass 1D dt64tz values + # More generally, any EA dtype that isn't is_1d_only_ea_dtype values = extract_array(values, extract_numpy=True) values = ensure_block_shape(values, ndim) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 3654f77825ab4..fdf3464dcfb3f 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -58,6 +58,7 @@ CategoricalDtype, ExtensionDtype, PandasDtype, + PeriodDtype, ) from pandas.core.dtypes.generic import ( ABCDataFrame, @@ -1708,6 +1709,11 @@ class NDArrayBackedExtensionBlock(libinternals.NDArrayBackedBlock, EABackedBlock values: NDArrayBackedExtensionArray + @cache_readonly + def is_extension(self) -> bool: + # i.e. datetime64tz, PeriodDtype + return not isinstance(self.dtype, np.dtype) + @property def is_view(self) -> bool: """return a boolean if I am possibly a view""" @@ -1734,6 +1740,9 @@ def where(self, other, cond) -> list[Block]: try: res_values = arr.T._where(cond, other).T except (ValueError, TypeError): + if isinstance(self.dtype, PeriodDtype): + # TODO: don't special-case + raise return Block.where(self, other, cond) nb = self.make_block_same_class(res_values) @@ -1927,6 +1936,8 @@ def get_block_type(dtype: DtypeObj): cls = CategoricalBlock elif vtype is Timestamp: cls = DatetimeTZBlock + elif isinstance(dtype, PeriodDtype): + cls = NDArrayBackedExtensionBlock elif isinstance(dtype, ExtensionDtype): # Note: need to be sure PandasArray is unwrapped before we get here cls = ExtensionBlock diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 77f3db0d09df5..9347e9b9109ac 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -314,7 +314,7 @@ def ndarray_to_mgr( return arrays_to_mgr(values, columns, index, dtype=dtype, typ=typ) elif is_extension_array_dtype(vdtype) and not is_1d_only_ea_dtype(vdtype): - # i.e. Datetime64TZ + # i.e. Datetime64TZ, PeriodDtype values = extract_array(values, extract_numpy=True) if copy: values = values.copy() diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index a231e52d4b027..97f6aa3872c81 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -1262,6 +1262,9 @@ def test_parr_add_timedeltalike_scalar(self, three_days, box_with_array): ) obj = tm.box_expected(ser, box_with_array) + if box_with_array is pd.DataFrame: + assert (obj.dtypes == "Period[D]").all() + expected = tm.box_expected(expected, box_with_array) result = obj + three_days diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index b7bb4c95372cc..45bc5b7caaf6f 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -5,7 +5,7 @@ import pandas as pd from pandas.api.extensions import ExtensionArray -from pandas.core.internals import ExtensionBlock +from pandas.core.internals.blocks import EABackedBlock from pandas.tests.extension.base.base import BaseExtensionTests @@ -28,7 +28,7 @@ def test_concat(self, data, in_frame): assert dtype == data.dtype if hasattr(result._mgr, "blocks"): - assert isinstance(result._mgr.blocks[0], ExtensionBlock) + assert isinstance(result._mgr.blocks[0], EABackedBlock) assert isinstance(result._mgr.arrays[0], ExtensionArray) @pytest.mark.parametrize("in_frame", [True, False]) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index b577bc7e436df..b9b36f828c357 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -1275,7 +1275,7 @@ def test_interval_can_hold_element(self, dtype, element): def test_period_can_hold_element_emptylist(self): pi = period_range("2016", periods=3, freq="A") - blk = new_block(pi._data, [1], ndim=2) + blk = new_block(pi._data.reshape(1, 3), [1], ndim=2) assert blk._can_hold_element([]) From d578de3ee5ef994c46c7435964bd9d528718631d Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 1 Dec 2021 10:36:40 -0800 Subject: [PATCH 2/5] warn for parquet --- pandas/core/internals/managers.py | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index d69709bf9d06c..bfb743841c349 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -14,6 +14,7 @@ import numpy as np from pandas._libs import ( + Period, internals as libinternals, lib, ) @@ -926,15 +927,27 @@ def __init__( f"Number of Block dimensions ({block.ndim}) must equal " f"number of axes ({self.ndim})" ) - if isinstance(block, DatetimeTZBlock) and block.values.ndim == 1: + if block.values.ndim == 1 and ( + isinstance(block, DatetimeTZBlock) or block.dtype.type is Period + ): # TODO(2.0): remove once fastparquet no longer needs this - warnings.warn( - "In a future version, the BlockManager constructor " - "will assume that a DatetimeTZBlock with block.ndim==2 " - "has block.values.ndim == 2.", - DeprecationWarning, - stacklevel=find_stack_level(), - ) + if isinstance(block, DatetimeTZBlock): + warnings.warn( + "In a future version, the BlockManager constructor " + "will assume that a DatetimeTZBlock with block.ndim==2 " + "has block.values.ndim == 2.", + DeprecationWarning, + stacklevel=find_stack_level(), + ) + else: + warnings.warn( + "In a future version, the BlockManager constructor " + "will assume that a Block backed by PeriodArray with " + "will be NDArrayBackedExtensionBlock and will have " + "block.values.ndim == 2", + DeprecationWarning, + stacklevel=find_stack_level(), + ) # error: Incompatible types in assignment (expression has type # "Union[ExtensionArray, ndarray]", variable has type From d105d4f9582e83bbf759c93732d0f6b0ec2f1c6b Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 2 Dec 2021 15:05:21 -0800 Subject: [PATCH 3/5] xfail on old pyarrow --- pandas/tests/io/test_feather.py | 8 +++++++- pandas/tests/io/test_parquet.py | 13 ++++++++++++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 59c7abc4a4cb8..15d41c56c13c1 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -2,6 +2,8 @@ import numpy as np import pytest +from pandas.compat.pyarrow import pa_version_under2p0 + import pandas as pd import pandas._testing as tm @@ -85,7 +87,11 @@ def test_basic(self): ), } ) - df["periods"] = pd.period_range("2013", freq="M", periods=3) + if not pa_version_under2p0: + # older pyarrow incorrectly uses pandas internal API, so + # constructs invalid Block + df["periods"] = pd.period_range("2013", freq="M", periods=3) + df["timedeltas"] = pd.timedelta_range("1 day", periods=3) df["intervals"] = pd.interval_range(0, 3, 3) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index c91356bad12f7..c38a720326c29 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -646,7 +646,15 @@ def test_use_nullable_dtypes(self, engine): "object", "datetime64[ns, UTC]", "float", - "period[D]", + pytest.param( + "period[D]", + # Note: I don't know exactly what version the cutoff is; + # On the CI it fails with 1.0.1 + marks=pytest.mark.xfail( + pa_version_under2p0, + reason="pyarrow uses pandas internal API incorrectly", + ), + ), "Float64", "string", ], @@ -890,6 +898,9 @@ def test_pyarrow_backed_string_array(self, pa, string_storage): check_round_trip(df, pa, expected=df.astype(f"string[{string_storage}]")) @td.skip_if_no("pyarrow") + @pytest.mark.xfail( + pa_version_under2p0, reason="pyarrow uses pandas internal API incorrectly" + ) def test_additional_extension_types(self, pa): # test additional ExtensionArrays that are supported through the # __arrow_array__ protocol + by defining a custom ExtensionType From c2cdc34d339114c9d734ffac62d90b76ef4e6202 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 4 Dec 2021 13:24:54 -0800 Subject: [PATCH 4/5] mypy fixup, xfail on old pyarrow --- pandas/core/internals/blocks.py | 3 ++- pandas/core/internals/managers.py | 7 +------ pandas/tests/arrays/period/test_arrow_compat.py | 11 +++++++++++ 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 84421bf7ebbe1..0081bdd891dad 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1721,8 +1721,9 @@ class NDArrayBackedExtensionBlock(libinternals.NDArrayBackedBlock, EABackedBlock values: NDArrayBackedExtensionArray + # error: Signature of "is_extension" incompatible with supertype "Block" @cache_readonly - def is_extension(self) -> bool: + def is_extension(self) -> bool: # type: ignore[override] # i.e. datetime64tz, PeriodDtype return not isinstance(self.dtype, np.dtype) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index bfb743841c349..80650568d67ed 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -949,12 +949,7 @@ def __init__( stacklevel=find_stack_level(), ) - # error: Incompatible types in assignment (expression has type - # "Union[ExtensionArray, ndarray]", variable has type - # "DatetimeArray") - block.values = ensure_block_shape( # type: ignore[assignment] - block.values, self.ndim - ) + block.values = ensure_block_shape(block.values, self.ndim) try: block._cache.clear() except AttributeError: diff --git a/pandas/tests/arrays/period/test_arrow_compat.py b/pandas/tests/arrays/period/test_arrow_compat.py index 560299a4a47f5..6066d49b68489 100644 --- a/pandas/tests/arrays/period/test_arrow_compat.py +++ b/pandas/tests/arrays/period/test_arrow_compat.py @@ -1,5 +1,7 @@ import pytest +from pandas.compat import pa_version_under2p0 + from pandas.core.dtypes.dtypes import PeriodDtype import pandas as pd @@ -69,6 +71,9 @@ def test_arrow_array_missing(): assert result.storage.equals(expected) +@pytest.mark.xfail( + pa_version_under2p0, reason="pyarrow incorrectly uses pandas internals API" +) def test_arrow_table_roundtrip(): from pandas.core.arrays._arrow_utils import ArrowPeriodType @@ -88,6 +93,9 @@ def test_arrow_table_roundtrip(): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail( + pa_version_under2p0, reason="pyarrow incorrectly uses pandas internals API" +) def test_arrow_load_from_zero_chunks(): # GH-41040 @@ -106,6 +114,9 @@ def test_arrow_load_from_zero_chunks(): tm.assert_frame_equal(result, df) +@pytest.mark.xfail( + pa_version_under2p0, reason="pyarrow incorrectly uses pandas internals API" +) def test_arrow_table_roundtrip_without_metadata(): arr = PeriodArray([1, 2, 3], freq="H") arr[1] = pd.NaT From acf4bfe3e82f09ab27b19b7137d613b6930d9d07 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 4 Dec 2021 18:14:27 -0800 Subject: [PATCH 5/5] revert warning handled by xfail --- pandas/core/internals/managers.py | 36 ++++++++++++------------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 80650568d67ed..d69709bf9d06c 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -14,7 +14,6 @@ import numpy as np from pandas._libs import ( - Period, internals as libinternals, lib, ) @@ -927,29 +926,22 @@ def __init__( f"Number of Block dimensions ({block.ndim}) must equal " f"number of axes ({self.ndim})" ) - if block.values.ndim == 1 and ( - isinstance(block, DatetimeTZBlock) or block.dtype.type is Period - ): + if isinstance(block, DatetimeTZBlock) and block.values.ndim == 1: # TODO(2.0): remove once fastparquet no longer needs this - if isinstance(block, DatetimeTZBlock): - warnings.warn( - "In a future version, the BlockManager constructor " - "will assume that a DatetimeTZBlock with block.ndim==2 " - "has block.values.ndim == 2.", - DeprecationWarning, - stacklevel=find_stack_level(), - ) - else: - warnings.warn( - "In a future version, the BlockManager constructor " - "will assume that a Block backed by PeriodArray with " - "will be NDArrayBackedExtensionBlock and will have " - "block.values.ndim == 2", - DeprecationWarning, - stacklevel=find_stack_level(), - ) + warnings.warn( + "In a future version, the BlockManager constructor " + "will assume that a DatetimeTZBlock with block.ndim==2 " + "has block.values.ndim == 2.", + DeprecationWarning, + stacklevel=find_stack_level(), + ) - block.values = ensure_block_shape(block.values, self.ndim) + # error: Incompatible types in assignment (expression has type + # "Union[ExtensionArray, ndarray]", variable has type + # "DatetimeArray") + block.values = ensure_block_shape( # type: ignore[assignment] + block.values, self.ndim + ) try: block._cache.clear() except AttributeError: