From 63091ac07a2b8cacc35dc3b96c394029d808ef24 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 25 Aug 2022 14:05:29 -0700 Subject: [PATCH 01/18] BUG: is_datetime64tz_dtype shortcut only for DatetimeTZDtype --- doc/source/reference/arrays.rst | 4 ++ doc/source/whatsnew/v1.5.0.rst | 2 + pandas/core/arrays/arrow/array.py | 7 ++ pandas/core/dtypes/common.py | 5 +- pandas/tests/dtypes/test_common.py | 11 ++++ pandas/tests/extension/test_arrow.py | 98 +++++----------------------- 6 files changed, 42 insertions(+), 85 deletions(-) diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst index 6d09e10f284af..0537cbb7c11a5 100644 --- a/doc/source/reference/arrays.rst +++ b/doc/source/reference/arrays.rst @@ -52,6 +52,10 @@ PyArrow This feature is experimental, and the API can change in a future release without warning. +The ``dtype`` argument of :class:`Series` and :class:`DataFrame` can accept a string of a `pyarrow data type `__ +with ``pyarrow`` in brackets e.g. ``"int64[pyarrow]"`` or, for pyarrow data types that take parameters, a :class:`ArrowDtype` +initialized with a ``pyarrow.DataType``. + The :class:`arrays.ArrowExtensionArray` is backed by a :external+pyarrow:py:class:`pyarrow.ChunkedArray` with a :external+pyarrow:py:class:`pyarrow.DataType` instead of a NumPy array and data type. The ``.dtype`` of a :class:`arrays.ArrowExtensionArray` is an :class:`ArrowDtype`. diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 9de855dea407d..247859b290630 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -54,6 +54,8 @@ initialized with a ``pyarrow.DataType``. Most operations are supported and have been implemented using `pyarrow compute `__ functions. We recommend installing the latest version of PyArrow to access the most recently implemented compute functions. +Please view the :ref:`API reference ` for more information. + .. warning:: This feature is experimental, and the API can change in a future release without warning. diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 1f7939011a1f1..0ca91f06406bb 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -199,6 +199,13 @@ class ArrowExtensionArray(OpsMixin, ExtensionArray): [1, 1, ] Length: 3, dtype: int64[pyarrow] + + Create a ArrowExtensionArray directly from an pyarrow array. + >>> import pyarrow as pa + >>> pd.arrays.ArrowExtensionArray(pa.array([1, 1, None])) + + [1, 1, ] + Length: 3, dtype: int64[pyarrow] """ # noqa: E501 (http link too long) _data: pa.ChunkedArray diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 1173703386491..9077e7f693137 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -379,9 +379,10 @@ def is_datetime64tz_dtype(arr_or_dtype) -> bool: >>> is_datetime64tz_dtype(s) True """ - if isinstance(arr_or_dtype, ExtensionDtype): + if isinstance(arr_or_dtype, DatetimeTZDtype): # GH#33400 fastpath for dtype object - return arr_or_dtype.kind == "M" + # GH 34986 + return True if arr_or_dtype is None: return False diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 984655c68d56b..f2f977dfd7f28 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -13,6 +13,7 @@ CategoricalDtype, CategoricalDtypeType, DatetimeTZDtype, + ExtensionDtype, IntervalDtype, PeriodDtype, ) @@ -239,6 +240,16 @@ def test_is_datetime64tz_dtype(): assert com.is_datetime64tz_dtype(pd.DatetimeIndex(["2000"], tz="US/Eastern")) +def test_custom_ea_kind_M_not_datetime64tz(): + # GH 34986 + class NotTZDtype(ExtensionDtype): + @property + def kind(self) -> str: + return "M" + + assert not com.is_datetime64tz_dtype(NotTZDtype()) + + def test_is_timedelta64_dtype(): assert not com.is_timedelta64_dtype(object) assert not com.is_timedelta64_dtype(None) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 43c52ef8848e2..b80cfac8c829b 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -420,15 +420,6 @@ def test_groupby_extension_no_sort(self, data_for_grouping, request): reason=f"pyarrow doesn't support factorizing {pa_dtype}", ) ) - elif pa.types.is_date(pa_dtype) or ( - pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None - ): - request.node.add_marker( - pytest.mark.xfail( - raises=AttributeError, - reason="GH 34986", - ) - ) super().test_groupby_extension_no_sort(data_for_grouping) def test_groupby_extension_transform(self, data_for_grouping, request): @@ -453,7 +444,6 @@ def test_groupby_extension_apply( ): pa_dtype = data_for_grouping.dtype.pyarrow_dtype # Is there a better way to get the "series" ID for groupby_apply_op? - is_series = "series" in request.node.nodeid is_object = "object" in request.node.nodeid if pa.types.is_duration(pa_dtype): request.node.add_marker( @@ -472,13 +462,6 @@ def test_groupby_extension_apply( reason="GH 47514: _concat_datetime expects axis arg.", ) ) - elif not is_series: - request.node.add_marker( - pytest.mark.xfail( - raises=AttributeError, - reason="GH 34986", - ) - ) super().test_groupby_extension_apply(data_for_grouping, groupby_apply_op) def test_in_numeric_groupby(self, data_for_grouping, request): @@ -508,16 +491,6 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping, request): reason=f"pyarrow doesn't support factorizing {pa_dtype}", ) ) - elif as_index is True and ( - pa.types.is_date(pa_dtype) - or (pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None) - ): - request.node.add_marker( - pytest.mark.xfail( - raises=AttributeError, - reason="GH 34986", - ) - ) super().test_groupby_extension_agg(as_index, data_for_grouping) @@ -752,22 +725,17 @@ def test_concat_extension_arrays_copy_false(self, data, na_value, request): def test_concat_with_reindex(self, data, request, using_array_manager): pa_dtype = data.dtype.pyarrow_dtype - if pa.types.is_duration(pa_dtype): + if ( + pa.types.is_duration(pa_dtype) + or pa.types.is_date(pa_dtype) + or (pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None) + ): request.node.add_marker( pytest.mark.xfail( raises=TypeError, reason="GH 47514: _concat_datetime expects axis arg.", ) ) - elif pa.types.is_date(pa_dtype) or ( - pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None - ): - request.node.add_marker( - pytest.mark.xfail( - raises=AttributeError if not using_array_manager else TypeError, - reason="GH 34986", - ) - ) super().test_concat_with_reindex(data) def test_align(self, data, na_value, request): @@ -810,32 +778,6 @@ def test_merge(self, data, na_value, request): ) super().test_merge(data, na_value) - def test_merge_on_extension_array(self, data, request): - pa_dtype = data.dtype.pyarrow_dtype - if pa.types.is_date(pa_dtype) or ( - pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None - ): - request.node.add_marker( - pytest.mark.xfail( - raises=AttributeError, - reason="GH 34986", - ) - ) - super().test_merge_on_extension_array(data) - - def test_merge_on_extension_array_duplicates(self, data, request): - pa_dtype = data.dtype.pyarrow_dtype - if pa.types.is_date(pa_dtype) or ( - pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None - ): - request.node.add_marker( - pytest.mark.xfail( - raises=AttributeError, - reason="GH 34986", - ) - ) - super().test_merge_on_extension_array_duplicates(data) - def test_ravel(self, data, request): tz = getattr(data.dtype.pyarrow_dtype, "tz", None) if pa_version_under2p0 and tz not in (None, "UTC"): @@ -1348,16 +1290,7 @@ def test_diff(self, data, periods, request): @pytest.mark.parametrize("dropna", [True, False]) def test_value_counts(self, all_data, dropna, request): pa_dtype = all_data.dtype.pyarrow_dtype - if pa.types.is_date(pa_dtype) or ( - pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None - ): - request.node.add_marker( - pytest.mark.xfail( - raises=AttributeError, - reason="GH 34986", - ) - ) - elif pa.types.is_duration(pa_dtype): + if pa.types.is_duration(pa_dtype): request.node.add_marker( pytest.mark.xfail( raises=pa.ArrowNotImplementedError, @@ -1368,16 +1301,7 @@ def test_value_counts(self, all_data, dropna, request): def test_value_counts_with_normalize(self, data, request): pa_dtype = data.dtype.pyarrow_dtype - if pa.types.is_date(pa_dtype) or ( - pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None - ): - request.node.add_marker( - pytest.mark.xfail( - raises=AttributeError, - reason="GH 34986", - ) - ) - elif pa.types.is_duration(pa_dtype): + if pa.types.is_duration(pa_dtype): request.node.add_marker( pytest.mark.xfail( raises=pa.ArrowNotImplementedError, @@ -2063,3 +1987,11 @@ def test_mode(data_for_grouping, dropna, take_idx, exp_idx, request): result = ser.mode(dropna=dropna) expected = pd.Series(data_for_grouping.take(exp_idx)) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("box", ["Series", "DataFrame"]) +def test_repr_from_arrow_array(data, box): + # GH 34986 & 48238 + pa_array = pa.array([data[0], None]) + result = getattr(pd, box)(pa_array, dtype=ArrowDtype(pa_array.type)) + repr(result) From 99bfd65878bb3086274068ab793764bd71c837c8 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 25 Aug 2022 14:09:55 -0700 Subject: [PATCH 02/18] add another whatsnew note --- doc/source/whatsnew/v1.5.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 247859b290630..82e9c60a37c4d 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -1031,7 +1031,7 @@ Conversion - Bug in :meth:`DataFrame.apply` that returns a :class:`DataFrame` instead of a :class:`Series` when applied to an empty :class:`DataFrame` and ``axis=1`` (:issue:`39111`) - Bug when inferring the dtype from an iterable that is *not* a NumPy ``ndarray`` consisting of all NumPy unsigned integer scalars did not result in an unsigned integer dtype (:issue:`47294`) - Bug in :meth:`DataFrame.eval` when pandas objects (e.g. ``'Timestamp'``) were column names (:issue:`44603`) -- +- Bug where any :class:`ExtensionDtype` subclass with ``kind="M"`` would be interpreted as a timezone type (:issue:`34986`) Strings ^^^^^^^ From 46dde84e827722ae7db0dc972f44cfa772397395 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 25 Aug 2022 16:16:18 -0700 Subject: [PATCH 03/18] Use frame_or_series --- pandas/tests/extension/test_arrow.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index b80cfac8c829b..8d44d8b5a09c5 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1989,9 +1989,8 @@ def test_mode(data_for_grouping, dropna, take_idx, exp_idx, request): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("box", ["Series", "DataFrame"]) -def test_repr_from_arrow_array(data, box): +def test_repr_from_arrow_array(data, frame_or_series): # GH 34986 & 48238 pa_array = pa.array([data[0], None]) - result = getattr(pd, box)(pa_array, dtype=ArrowDtype(pa_array.type)) + result = frame_or_series(pa_array, dtype=ArrowDtype(pa_array.type)) repr(result) From 4dfb27c474a86171562dbc98bbc0b9c9c19cbbde Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 29 Aug 2022 15:46:43 -0700 Subject: [PATCH 04/18] more specific warning condition --- pandas/tests/extension/test_arrow.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index dc95a8372c5c0..e93f436ed2bca 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1331,7 +1331,13 @@ def test_value_counts_with_normalize(self, data, request): ) ) with tm.maybe_produces_warning( - PerformanceWarning, pa_version_under7p0, check_stacklevel=False + PerformanceWarning, + pa_version_under7p0 + and not ( + pa.types.is_date(pa_dtype) + or (pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None) + ), + check_stacklevel=False, ): super().test_value_counts_with_normalize(data) From 13f5f3a3c2afe01a0bb8870cbd484741b90a19aa Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 1 Sep 2022 17:28:30 -0700 Subject: [PATCH 05/18] Try something random --- pandas/core/arrays/arrow/array.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 0ca91f06406bb..37299ac1a4d0b 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -469,6 +469,12 @@ def isna(self) -> npt.NDArray[np.bool_]: else: return self._data.is_null().to_numpy() + def _values_for_argsort(self) -> np.ndarray: + if pa_version_under2p0: + return self._data.to_pandas().values + else: + return self._data.to_numpy() + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) def argsort( self, From cce9a1275ffd53c920819ca77fab84fbcf92c794 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 2 Sep 2022 10:33:06 -0700 Subject: [PATCH 06/18] Remove random try --- pandas/core/arrays/arrow/array.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 37299ac1a4d0b..0ca91f06406bb 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -469,12 +469,6 @@ def isna(self) -> npt.NDArray[np.bool_]: else: return self._data.is_null().to_numpy() - def _values_for_argsort(self) -> np.ndarray: - if pa_version_under2p0: - return self._data.to_pandas().values - else: - return self._data.to_numpy() - @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) def argsort( self, From 8fe8de292e273372825ebb23e7090131aeeb026a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 2 Sep 2022 10:42:16 -0700 Subject: [PATCH 07/18] try skipping arg* tests for pyarrow? --- pandas/tests/extension/test_arrow.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index e93f436ed2bca..0af3ee7ac762b 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1288,12 +1288,21 @@ def test_invert(self, data, request): class TestBaseMethods(base.BaseMethodsTests): + @pytest.mark.skip(reason="TEST") + def test_argsort(self, data_for_sorting): + super().test_argsort(data_for_sorting) + + @pytest.mark.skip(reason="TEST") def test_argsort_missing_array(self, data_missing_for_sorting): with tm.maybe_produces_warning( PerformanceWarning, pa_version_under7p0, check_stacklevel=False ): super().test_argsort_missing_array(data_missing_for_sorting) + @pytest.mark.skip(reason="TEST") + def test_argsort_missing(self, data_missing_for_sorting): + super().test_argsort_missing(data_missing_for_sorting) + @pytest.mark.parametrize("periods", [1, -2]) def test_diff(self, data, periods, request): pa_dtype = data.dtype.pyarrow_dtype @@ -1341,6 +1350,7 @@ def test_value_counts_with_normalize(self, data, request): ): super().test_value_counts_with_normalize(data) + @pytest.mark.skip(reason="TEST") @pytest.mark.xfail( pa_version_under6p0, raises=NotImplementedError, @@ -1365,6 +1375,17 @@ def test_argmin_argmax( ) super().test_argmin_argmax(data_for_sorting, data_missing_for_sorting, na_value) + @pytest.mark.skip(reason="TEST") + @pytest.mark.parametrize("method", ["argmax", "argmin"]) + def test_argmin_argmax_empty_array(self, method, data): + super().test_argmin_argmax_empty_array(method, data) + + @pytest.mark.skip(reason="TEST") + @pytest.mark.parametrize("method", ["argmax", "argmin"]) + def test_argmin_argmax_all_na(self, method, data, na_value): + super().test_argmin_argmax_all_na(method, data, na_value) + + @pytest.mark.skip(reason="TEST") @pytest.mark.parametrize( "op_name, skipna, expected", [ @@ -1400,6 +1421,11 @@ def test_argreduce_series( data_missing_for_sorting, op_name, skipna, expected ) + @pytest.mark.skip + def test_argmax_argmin_no_skipna_notimplemented(self, data_missing_for_sorting): + super().test_argmax_argmin_no_skipna_notimplemented(data_missing_for_sorting) + + @pytest.mark.skip @pytest.mark.parametrize( "na_position, expected", [ From 2f55e02e69263a7eb7d323a32a9595f30533b142 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 7 Sep 2022 13:41:24 -0700 Subject: [PATCH 08/18] Add test for needs_i8_conversion --- pandas/tests/dtypes/test_common.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index f2f977dfd7f28..753e39c1f7937 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -247,7 +247,9 @@ class NotTZDtype(ExtensionDtype): def kind(self) -> str: return "M" - assert not com.is_datetime64tz_dtype(NotTZDtype()) + not_tz_dtype = NotTZDtype() + assert not com.is_datetime64tz_dtype(not_tz_dtype) + assert com.needs_i8_conversion(not_tz_dtype) def test_is_timedelta64_dtype(): From e83c00749a3871871709f6a67c8f9a7e7af8fcba Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 9 Sep 2022 13:59:54 -0700 Subject: [PATCH 09/18] Force colors for failures? --- .github/workflows/macos-windows.yml | 1 + .github/workflows/ubuntu.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/macos-windows.yml b/.github/workflows/macos-windows.yml index 8b3d69943bd9d..f08327c83cebc 100644 --- a/.github/workflows/macos-windows.yml +++ b/.github/workflows/macos-windows.yml @@ -41,6 +41,7 @@ jobs: group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.os }} cancel-in-progress: true env: + FORCE_COLOR: "1" # GH 47443: PYTEST_WORKERS > 1 crashes Windows builds with memory related errors PYTEST_WORKERS: ${{ matrix.os == 'macos-latest' && 'auto' || '1' }} diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index 260c857e608b3..215fcd9b4c90c 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -105,6 +105,7 @@ jobs: IS_PYPY: ${{ contains(matrix.env_file, 'pypy') }} # TODO: re-enable coverage on pypy, its slow COVERAGE: ${{ !contains(matrix.env_file, 'pypy') }} + FORCE_COLOR: "1" concurrency: # https://github.community/t/concurrecy-not-work-for-push/183068/7 group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.pyarrow_version || '' }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_data_manager || '' }} From 011201d46d6b9d2b2cd17d7c850fdd301dcbde64 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 9 Sep 2022 15:51:19 -0700 Subject: [PATCH 10/18] PY_COLOR? --- .github/workflows/macos-windows.yml | 2 +- .github/workflows/ubuntu.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/macos-windows.yml b/.github/workflows/macos-windows.yml index f08327c83cebc..89ac00dc68880 100644 --- a/.github/workflows/macos-windows.yml +++ b/.github/workflows/macos-windows.yml @@ -41,7 +41,7 @@ jobs: group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.os }} cancel-in-progress: true env: - FORCE_COLOR: "1" + PY_COLORS: "1" # GH 47443: PYTEST_WORKERS > 1 crashes Windows builds with memory related errors PYTEST_WORKERS: ${{ matrix.os == 'macos-latest' && 'auto' || '1' }} diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index 215fcd9b4c90c..cf17ad959b528 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -105,7 +105,7 @@ jobs: IS_PYPY: ${{ contains(matrix.env_file, 'pypy') }} # TODO: re-enable coverage on pypy, its slow COVERAGE: ${{ !contains(matrix.env_file, 'pypy') }} - FORCE_COLOR: "1" + PY_COLORS: "1" concurrency: # https://github.community/t/concurrecy-not-work-for-push/183068/7 group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.pyarrow_version || '' }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_data_manager || '' }} From 911c46e720b3760dc6c2f35caf8709e688164495 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 9 Sep 2022 17:04:04 -0700 Subject: [PATCH 11/18] See if no color logs errors --- .github/workflows/macos-windows.yml | 1 - .github/workflows/ubuntu.yml | 1 - pyproject.toml | 2 +- 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/macos-windows.yml b/.github/workflows/macos-windows.yml index 89ac00dc68880..8b3d69943bd9d 100644 --- a/.github/workflows/macos-windows.yml +++ b/.github/workflows/macos-windows.yml @@ -41,7 +41,6 @@ jobs: group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.os }} cancel-in-progress: true env: - PY_COLORS: "1" # GH 47443: PYTEST_WORKERS > 1 crashes Windows builds with memory related errors PYTEST_WORKERS: ${{ matrix.os == 'macos-latest' && 'auto' || '1' }} diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index cf17ad959b528..260c857e608b3 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -105,7 +105,6 @@ jobs: IS_PYPY: ${{ contains(matrix.env_file, 'pypy') }} # TODO: re-enable coverage on pypy, its slow COVERAGE: ${{ !contains(matrix.env_file, 'pypy') }} - PY_COLORS: "1" concurrency: # https://github.community/t/concurrecy-not-work-for-push/183068/7 group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.pyarrow_version || '' }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_data_manager || '' }} diff --git a/pyproject.toml b/pyproject.toml index 97a0d30092804..89afdcfd32d22 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,7 @@ exclude = ''' [tool.pytest.ini_options] # sync minversion with pyproject.toml & install.rst minversion = "6.0" -addopts = "--strict-data-files --strict-markers --strict-config --capture=no --durations=30 --junitxml=test-data.xml --color=yes" +addopts = "--strict-data-files --strict-markers --strict-config --capture=no --durations=30 --junitxml=test-data.xml" xfail_strict = true testpaths = "pandas" doctest_optionflags = [ From 88d2e2cd943889574c8f40e402a8dde0804cc56c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 13 Sep 2022 11:10:44 -0700 Subject: [PATCH 12/18] Undo skips --- pandas/tests/extension/test_arrow.py | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index ebcf4298deb76..fa6242a5eb6d9 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1139,21 +1139,12 @@ def test_invert(self, data, request): class TestBaseMethods(base.BaseMethodsTests): - @pytest.mark.skip(reason="TEST") - def test_argsort(self, data_for_sorting): - super().test_argsort(data_for_sorting) - - @pytest.mark.skip(reason="TEST") def test_argsort_missing_array(self, data_missing_for_sorting): with tm.maybe_produces_warning( PerformanceWarning, pa_version_under7p0, check_stacklevel=False ): super().test_argsort_missing_array(data_missing_for_sorting) - @pytest.mark.skip(reason="TEST") - def test_argsort_missing(self, data_missing_for_sorting): - super().test_argsort_missing(data_missing_for_sorting) - @pytest.mark.parametrize("periods", [1, -2]) def test_diff(self, data, periods, request): pa_dtype = data.dtype.pyarrow_dtype @@ -1201,7 +1192,6 @@ def test_value_counts_with_normalize(self, data, request): ): super().test_value_counts_with_normalize(data) - @pytest.mark.skip(reason="TEST") @pytest.mark.xfail( pa_version_under6p0, raises=NotImplementedError, @@ -1226,17 +1216,6 @@ def test_argmin_argmax( ) super().test_argmin_argmax(data_for_sorting, data_missing_for_sorting, na_value) - @pytest.mark.skip(reason="TEST") - @pytest.mark.parametrize("method", ["argmax", "argmin"]) - def test_argmin_argmax_empty_array(self, method, data): - super().test_argmin_argmax_empty_array(method, data) - - @pytest.mark.skip(reason="TEST") - @pytest.mark.parametrize("method", ["argmax", "argmin"]) - def test_argmin_argmax_all_na(self, method, data, na_value): - super().test_argmin_argmax_all_na(method, data, na_value) - - @pytest.mark.skip(reason="TEST") @pytest.mark.parametrize( "op_name, skipna, expected", [ @@ -1272,10 +1251,6 @@ def test_argreduce_series( data_missing_for_sorting, op_name, skipna, expected ) - @pytest.mark.skip - def test_argmax_argmin_no_skipna_notimplemented(self, data_missing_for_sorting): - super().test_argmax_argmin_no_skipna_notimplemented(data_missing_for_sorting) - @pytest.mark.skip @pytest.mark.parametrize( "na_position, expected", From ddf8f61e0a897ee39a3d32dbe8101ac4977b99e0 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 13 Sep 2022 11:12:24 -0700 Subject: [PATCH 13/18] Missed skip --- pandas/tests/extension/test_arrow.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index fa6242a5eb6d9..148a663a68b49 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1251,7 +1251,6 @@ def test_argreduce_series( data_missing_for_sorting, op_name, skipna, expected ) - @pytest.mark.skip @pytest.mark.parametrize( "na_position, expected", [ From 17b78e949d4b99d05e7c240118bc08dfd231ffa8 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 13 Sep 2022 13:27:24 -0700 Subject: [PATCH 14/18] Remove test --- pandas/tests/dtypes/test_common.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 753e39c1f7937..b47e4881be4c7 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -249,7 +249,6 @@ def kind(self) -> str: not_tz_dtype = NotTZDtype() assert not com.is_datetime64tz_dtype(not_tz_dtype) - assert com.needs_i8_conversion(not_tz_dtype) def test_is_timedelta64_dtype(): From e05114464676cdfc3fdf5eca46caebb781753efb Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 13 Sep 2022 13:28:51 -0700 Subject: [PATCH 15/18] Undo test removal --- pandas/tests/dtypes/test_common.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index b47e4881be4c7..61c2b58388cc2 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -249,6 +249,7 @@ def kind(self) -> str: not_tz_dtype = NotTZDtype() assert not com.is_datetime64tz_dtype(not_tz_dtype) + assert not com.needs_i8_conversion(not_tz_dtype) def test_is_timedelta64_dtype(): From 21d152c6af3334a5e6918724758f82eba09b1c5a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 13 Sep 2022 16:08:38 -0700 Subject: [PATCH 16/18] Try not testing pyarrow timedelta types --- pandas/_testing/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 1035fd08a1a36..3f60b39980f36 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -215,7 +215,7 @@ for unit in ["s", "ms", "us", "ns"] for tz in [None, "UTC", "US/Pacific", "US/Eastern"] ] - TIMEDELTA_PYARROW_DTYPES = [pa.duration(unit) for unit in ["s", "ms", "us", "ns"]] + # TIMEDELTA_PYARROW_DTYPES = [pa.duration(unit) for unit in ["s", "ms", "us", "ns"]] BOOL_PYARROW_DTYPES = [pa.bool_()] @@ -227,7 +227,7 @@ + TIME_PYARROW_DTYPES + DATE_PYARROW_DTYPES + DATETIME_PYARROW_DTYPES - + TIMEDELTA_PYARROW_DTYPES + # + TIMEDELTA_PYARROW_DTYPES + BOOL_PYARROW_DTYPES ) From 161ad6d47475bfc814062ffe33524908836d4746 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 13 Sep 2022 20:15:07 -0700 Subject: [PATCH 17/18] Revert the is_datetime64tz_dtype change --- doc/source/whatsnew/v1.5.0.rst | 1 - pandas/_testing/__init__.py | 4 ++-- pandas/core/dtypes/common.py | 5 ++--- pandas/tests/dtypes/test_common.py | 13 ------------- pandas/tests/extension/test_arrow.py | 10 ++-------- pyproject.toml | 2 +- 6 files changed, 7 insertions(+), 28 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index eb307db8e602f..3f45b6393bfb7 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -1036,7 +1036,6 @@ Conversion - Bug in :meth:`DataFrame.apply` that returns a :class:`DataFrame` instead of a :class:`Series` when applied to an empty :class:`DataFrame` and ``axis=1`` (:issue:`39111`) - Bug when inferring the dtype from an iterable that is *not* a NumPy ``ndarray`` consisting of all NumPy unsigned integer scalars did not result in an unsigned integer dtype (:issue:`47294`) - Bug in :meth:`DataFrame.eval` when pandas objects (e.g. ``'Timestamp'``) were column names (:issue:`44603`) -- Bug where any :class:`ExtensionDtype` subclass with ``kind="M"`` would be interpreted as a timezone type (:issue:`34986`) Strings ^^^^^^^ diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 3f60b39980f36..1035fd08a1a36 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -215,7 +215,7 @@ for unit in ["s", "ms", "us", "ns"] for tz in [None, "UTC", "US/Pacific", "US/Eastern"] ] - # TIMEDELTA_PYARROW_DTYPES = [pa.duration(unit) for unit in ["s", "ms", "us", "ns"]] + TIMEDELTA_PYARROW_DTYPES = [pa.duration(unit) for unit in ["s", "ms", "us", "ns"]] BOOL_PYARROW_DTYPES = [pa.bool_()] @@ -227,7 +227,7 @@ + TIME_PYARROW_DTYPES + DATE_PYARROW_DTYPES + DATETIME_PYARROW_DTYPES - # + TIMEDELTA_PYARROW_DTYPES + + TIMEDELTA_PYARROW_DTYPES + BOOL_PYARROW_DTYPES ) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index f8752e1c16c38..aeb9bc7a31674 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -382,10 +382,9 @@ def is_datetime64tz_dtype(arr_or_dtype) -> bool: >>> is_datetime64tz_dtype(s) True """ - if isinstance(arr_or_dtype, DatetimeTZDtype): + if isinstance(arr_or_dtype, ExtensionDtype): # GH#33400 fastpath for dtype object - # GH 34986 - return True + return arr_or_dtype.kind == "M" if arr_or_dtype is None: return False diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 61c2b58388cc2..984655c68d56b 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -13,7 +13,6 @@ CategoricalDtype, CategoricalDtypeType, DatetimeTZDtype, - ExtensionDtype, IntervalDtype, PeriodDtype, ) @@ -240,18 +239,6 @@ def test_is_datetime64tz_dtype(): assert com.is_datetime64tz_dtype(pd.DatetimeIndex(["2000"], tz="US/Eastern")) -def test_custom_ea_kind_M_not_datetime64tz(): - # GH 34986 - class NotTZDtype(ExtensionDtype): - @property - def kind(self) -> str: - return "M" - - not_tz_dtype = NotTZDtype() - assert not com.is_datetime64tz_dtype(not_tz_dtype) - assert not com.needs_i8_conversion(not_tz_dtype) - - def test_is_timedelta64_dtype(): assert not com.is_timedelta64_dtype(object) assert not com.is_timedelta64_dtype(None) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index e7d02b993cceb..314ebb1606c7c 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1095,13 +1095,7 @@ def test_value_counts_with_normalize(self, data, request): ) ) with tm.maybe_produces_warning( - PerformanceWarning, - pa_version_under7p0 - and not ( - pa.types.is_date(pa_dtype) - or (pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None) - ), - check_stacklevel=False, + PerformanceWarning, pa_version_under7p0, check_stacklevel=False ): super().test_value_counts_with_normalize(data) @@ -1759,7 +1753,7 @@ def test_mode(data_for_grouping, dropna, take_idx, exp_idx, request): def test_repr_from_arrow_array(data, frame_or_series): - # GH 34986 & 48238 + # GH 48238 pa_array = pa.array([data[0], None]) result = frame_or_series(pa_array, dtype=ArrowDtype(pa_array.type)) repr(result) diff --git a/pyproject.toml b/pyproject.toml index 89afdcfd32d22..97a0d30092804 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,7 @@ exclude = ''' [tool.pytest.ini_options] # sync minversion with pyproject.toml & install.rst minversion = "6.0" -addopts = "--strict-data-files --strict-markers --strict-config --capture=no --durations=30 --junitxml=test-data.xml" +addopts = "--strict-data-files --strict-markers --strict-config --capture=no --durations=30 --junitxml=test-data.xml --color=yes" xfail_strict = true testpaths = "pandas" doctest_optionflags = [ From 8ef9f46249b80e14c0542ccbf8a5bcc186934743 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 14 Sep 2022 11:59:43 -0700 Subject: [PATCH 18/18] Validate it's pa.duration again --- pandas/_testing/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 1035fd08a1a36..3f60b39980f36 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -215,7 +215,7 @@ for unit in ["s", "ms", "us", "ns"] for tz in [None, "UTC", "US/Pacific", "US/Eastern"] ] - TIMEDELTA_PYARROW_DTYPES = [pa.duration(unit) for unit in ["s", "ms", "us", "ns"]] + # TIMEDELTA_PYARROW_DTYPES = [pa.duration(unit) for unit in ["s", "ms", "us", "ns"]] BOOL_PYARROW_DTYPES = [pa.bool_()] @@ -227,7 +227,7 @@ + TIME_PYARROW_DTYPES + DATE_PYARROW_DTYPES + DATETIME_PYARROW_DTYPES - + TIMEDELTA_PYARROW_DTYPES + # + TIMEDELTA_PYARROW_DTYPES + BOOL_PYARROW_DTYPES )