Skip to content

TST: Filter/test pyarrow PerformanceWarnings #48093

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Aug 19, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 33 additions & 6 deletions pandas/tests/base/test_value_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
import numpy as np
import pytest

from pandas.compat import pa_version_under7p0
from pandas.errors import PerformanceWarning

import pandas as pd
from pandas import (
DatetimeIndex,
Expand Down Expand Up @@ -36,8 +39,16 @@ def test_value_counts(index_or_series_obj):
# TODO(GH#32514): Order of entries with the same count is inconsistent
# on CI (gh-32449)
if obj.duplicated().any():
result = result.sort_index()
expected = expected.sort_index()
with tm.maybe_produces_warning(
PerformanceWarning,
pa_version_under7p0 and getattr(obj.dtype, "storage", "") == "pyarrow",
):
result = result.sort_index()
with tm.maybe_produces_warning(
PerformanceWarning,
pa_version_under7p0 and getattr(obj.dtype, "storage", "") == "pyarrow",
):
expected = expected.sort_index()
tm.assert_series_equal(result, expected)


Expand Down Expand Up @@ -70,8 +81,16 @@ def test_value_counts_null(null_obj, index_or_series_obj):
if obj.duplicated().any():
# TODO(GH#32514):
# Order of entries with the same count is inconsistent on CI (gh-32449)
expected = expected.sort_index()
result = result.sort_index()
with tm.maybe_produces_warning(
PerformanceWarning,
pa_version_under7p0 and getattr(obj.dtype, "storage", "") == "pyarrow",
):
expected = expected.sort_index()
with tm.maybe_produces_warning(
PerformanceWarning,
pa_version_under7p0 and getattr(obj.dtype, "storage", "") == "pyarrow",
):
result = result.sort_index()

if not isinstance(result.dtype, np.dtype):
# i.e IntegerDtype
Expand All @@ -84,8 +103,16 @@ def test_value_counts_null(null_obj, index_or_series_obj):
if obj.duplicated().any():
# TODO(GH#32514):
# Order of entries with the same count is inconsistent on CI (gh-32449)
expected = expected.sort_index()
result = result.sort_index()
with tm.maybe_produces_warning(
PerformanceWarning,
pa_version_under7p0 and getattr(obj.dtype, "storage", "") == "pyarrow",
):
expected = expected.sort_index()
with tm.maybe_produces_warning(
PerformanceWarning,
pa_version_under7p0 and getattr(obj.dtype, "storage", "") == "pyarrow",
):
result = result.sort_index()
tm.assert_series_equal(result, expected)


Expand Down
141 changes: 138 additions & 3 deletions pandas/tests/extension/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,10 @@
import numpy as np
import pytest

from pandas.compat import pa_version_under6p0
from pandas.compat import (
pa_version_under6p0,
pa_version_under7p0,
)
from pandas.errors import PerformanceWarning

import pandas as pd
Expand Down Expand Up @@ -167,6 +170,22 @@ def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna):


class TestMethods(base.BaseMethodsTests):
def test_argsort(self, data_for_sorting):
    """Run the inherited ``test_argsort`` under a conditional warning check.

    The ``PerformanceWarning`` condition handed to
    ``tm.maybe_produces_warning`` is truthy only when ``pa_version_under7p0``
    is set and the fixture's dtype reports ``storage == "pyarrow"``.
    """
    # Dtypes without a ``storage`` attribute fall back to "" and never match.
    warns = (
        pa_version_under7p0
        and getattr(data_for_sorting.dtype, "storage", "") == "pyarrow"
    )
    with tm.maybe_produces_warning(PerformanceWarning, warns):
        super().test_argsort(data_for_sorting)

def test_argsort_missing(self, data_missing_for_sorting):
    """Run the inherited ``test_argsort_missing`` under a conditional warning check.

    The ``PerformanceWarning`` condition is truthy only when
    ``pa_version_under7p0`` is set and the fixture's dtype reports
    ``storage == "pyarrow"``.
    """
    # Dtypes without a ``storage`` attribute fall back to "" and never match.
    warns = (
        pa_version_under7p0
        and getattr(data_missing_for_sorting.dtype, "storage", "") == "pyarrow"
    )
    with tm.maybe_produces_warning(PerformanceWarning, warns):
        super().test_argsort_missing(data_missing_for_sorting)

def test_argmin_argmax(
self, data_for_sorting, data_missing_for_sorting, na_value, request
):
Expand Down Expand Up @@ -210,6 +229,89 @@ def test_argreduce_series(
data_missing_for_sorting, op_name, skipna, expected
)

@pytest.mark.parametrize("dropna", [True, False])
def test_value_counts(self, all_data, dropna, request):
    """Compare ``value_counts`` of the data against the (optionally NA-free) data.

    Only the first ten elements of ``all_data`` are used.  With ``dropna``
    the reference values are the non-missing elements; otherwise they are
    the same ten elements.  Both computations run under a conditional
    ``PerformanceWarning`` check.
    """

    def expects_warning(values):
        # Same three-way condition for both computations: old pyarrow,
        # pyarrow-backed storage, and not the "dropna on data_missing" case.
        return (
            pa_version_under7p0
            and getattr(values.dtype, "storage", "") == "pyarrow"
            and (not dropna or "data_missing" not in request.node.nodeid)
        )

    all_data = all_data[:10]
    other = all_data[~all_data.isna()] if dropna else all_data

    with tm.maybe_produces_warning(PerformanceWarning, expects_warning(all_data)):
        counts = pd.Series(all_data).value_counts(dropna=dropna)
        result = counts.sort_index()
    with tm.maybe_produces_warning(PerformanceWarning, expects_warning(other)):
        reference_counts = pd.Series(other).value_counts(dropna=dropna)
        expected = reference_counts.sort_index()

    self.assert_series_equal(result, expected)

@pytest.mark.filterwarnings("ignore:Falling back:pandas.errors.PerformanceWarning")
def test_value_counts_with_normalize(self, data):
    """Delegate to the inherited test with "Falling back" warnings ignored.

    The filter on this method ignores ``PerformanceWarning`` messages that
    start with "Falling back" instead of asserting on them.
    """
    super().test_value_counts_with_normalize(data)

def test_argsort_missing_array(self, data_missing_for_sorting):
    """Run the inherited ``test_argsort_missing_array`` under a warning check.

    The ``PerformanceWarning`` condition is truthy only when
    ``pa_version_under7p0`` is set and the fixture's dtype reports
    ``storage == "pyarrow"``.
    """
    # Dtypes without a ``storage`` attribute fall back to "" and never match.
    warns = (
        pa_version_under7p0
        and getattr(data_missing_for_sorting.dtype, "storage", "") == "pyarrow"
    )
    with tm.maybe_produces_warning(PerformanceWarning, warns):
        # Delegate to the same-named parent test.  This override previously
        # called ``test_argsort_missing``, so the array-level check was never
        # exercised and the Series-level one ran twice.
        super().test_argsort_missing_array(data_missing_for_sorting)

@pytest.mark.parametrize(
    "na_position, expected",
    [
        ("last", np.array([2, 0, 1], dtype=np.dtype("intp"))),
        ("first", np.array([1, 2, 0], dtype=np.dtype("intp"))),
    ],
)
def test_nargsort(self, data_missing_for_sorting, na_position, expected):
    """Run the inherited ``test_nargsort`` under a conditional warning check.

    Parametrized over ``na_position`` ("last" / "first") with the matching
    expected ``intp`` index array, which is forwarded unchanged.
    """
    # GH 25439
    warns = (
        pa_version_under7p0
        and getattr(data_missing_for_sorting.dtype, "storage", "") == "pyarrow"
    )
    with tm.maybe_produces_warning(PerformanceWarning, warns):
        super().test_nargsort(data_missing_for_sorting, na_position, expected)

@pytest.mark.parametrize("ascending", [True, False])
def test_sort_values(self, data_for_sorting, ascending, sort_by_key):
    """Run the inherited ``test_sort_values`` under a conditional warning check.

    Parametrized over both sort directions; ``ascending`` and ``sort_by_key``
    are forwarded unchanged.
    """
    # Dtypes without a ``storage`` attribute fall back to "" and never match.
    warns = (
        pa_version_under7p0
        and getattr(data_for_sorting.dtype, "storage", "") == "pyarrow"
    )
    with tm.maybe_produces_warning(PerformanceWarning, warns):
        super().test_sort_values(data_for_sorting, ascending, sort_by_key)

@pytest.mark.parametrize("ascending", [True, False])
def test_sort_values_missing(self, data_missing_for_sorting, ascending, sort_by_key):
    """Run the inherited ``test_sort_values_missing`` under a warning check.

    Parametrized over both sort directions; ``ascending`` and ``sort_by_key``
    are forwarded unchanged.
    """
    # Dtypes without a ``storage`` attribute fall back to "" and never match.
    warns = (
        pa_version_under7p0
        and getattr(data_missing_for_sorting.dtype, "storage", "") == "pyarrow"
    )
    with tm.maybe_produces_warning(PerformanceWarning, warns):
        super().test_sort_values_missing(
            data_missing_for_sorting, ascending, sort_by_key
        )

@pytest.mark.parametrize("ascending", [True, False])
def test_sort_values_frame(self, data_for_sorting, ascending):
    """Run the inherited ``test_sort_values_frame`` under a warning check.

    Parametrized over both sort directions; ``ascending`` is forwarded
    unchanged.
    """
    # Dtypes without a ``storage`` attribute fall back to "" and never match.
    warns = (
        pa_version_under7p0
        and getattr(data_for_sorting.dtype, "storage", "") == "pyarrow"
    )
    with tm.maybe_produces_warning(PerformanceWarning, warns):
        super().test_sort_values_frame(data_for_sorting, ascending)


class TestCasting(base.BaseCastingTests):
pass
Expand All @@ -236,8 +338,41 @@ class TestPrinting(base.BasePrintingTests):


class TestGroupBy(base.BaseGroupbyTests):
def test_groupby_extension_transform(self, data_for_grouping, request):
super().test_groupby_extension_transform(data_for_grouping)
@pytest.mark.parametrize("as_index", [True, False])
def test_groupby_extension_agg(self, as_index, data_for_grouping):
    """Group by the extension column and check the per-group mean of ``A``.

    The expected group keys come from ``pd.factorize(..., sort=True)`` on the
    fixture.  With ``as_index`` the result is compared as a Series indexed by
    those keys; otherwise as a two-column DataFrame.
    """
    frame = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
    # One condition serves both the groupby and the factorize call.
    warns = (
        pa_version_under7p0
        and getattr(data_for_grouping.dtype, "storage", "") == "pyarrow"
    )

    with tm.maybe_produces_warning(PerformanceWarning, warns):
        grouped = frame.groupby("B", as_index=as_index)
        result = grouped.A.mean()
    with tm.maybe_produces_warning(PerformanceWarning, warns):
        _, uniques = pd.factorize(data_for_grouping, sort=True)

    if not as_index:
        expected = pd.DataFrame({"B": uniques, "A": [3.0, 1.0, 4.0]})
        self.assert_frame_equal(result, expected)
        return

    index = pd.Index._with_infer(uniques, name="B")
    expected = pd.Series([3.0, 1.0, 4.0], index=index, name="A")
    self.assert_series_equal(result, expected)

def test_groupby_extension_transform(self, data_for_grouping):
    """Run the inherited ``test_groupby_extension_transform`` under a warning check.

    The ``PerformanceWarning`` condition is truthy only when
    ``pa_version_under7p0`` is set and the fixture's dtype reports
    ``storage == "pyarrow"``.
    """
    # Dtypes without a ``storage`` attribute fall back to "" and never match.
    warns = (
        pa_version_under7p0
        and getattr(data_for_grouping.dtype, "storage", "") == "pyarrow"
    )
    with tm.maybe_produces_warning(PerformanceWarning, warns):
        super().test_groupby_extension_transform(data_for_grouping)

@pytest.mark.filterwarnings("ignore:Falling back:pandas.errors.PerformanceWarning")
def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op):
    """Delegate to the inherited test with "Falling back" warnings ignored.

    The filter on this method ignores ``PerformanceWarning`` messages that
    start with "Falling back" instead of asserting on them.
    """
    super().test_groupby_extension_apply(data_for_grouping, groupby_apply_op)


class Test2DCompat(base.Dim2CompatTests):
Expand Down
16 changes: 14 additions & 2 deletions pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
algos as libalgos,
hashtable as ht,
)
from pandas.compat import pa_version_under7p0
from pandas.errors import PerformanceWarning
import pandas.util._test_decorators as td

from pandas.core.dtypes.common import (
Expand Down Expand Up @@ -50,7 +52,13 @@ class TestFactorize:
@pytest.mark.parametrize("sort", [True, False])
def test_factorize(self, index_or_series_obj, sort):
obj = index_or_series_obj
result_codes, result_uniques = obj.factorize(sort=sort)
with tm.maybe_produces_warning(
PerformanceWarning,
sort
and pa_version_under7p0
and getattr(obj.dtype, "storage", "") == "pyarrow",
):
result_codes, result_uniques = obj.factorize(sort=sort)

constructor = Index
if isinstance(obj, MultiIndex):
Expand All @@ -64,7 +72,11 @@ def test_factorize(self, index_or_series_obj, sort):
expected_uniques = expected_uniques.astype(object)

if sort:
expected_uniques = expected_uniques.sort_values()
with tm.maybe_produces_warning(
PerformanceWarning,
pa_version_under7p0 and getattr(obj.dtype, "storage", "") == "pyarrow",
):
expected_uniques = expected_uniques.sort_values()

# construct an integer ndarray so that
# `expected_uniques.take(expected_codes)` is equal to `obj`
Expand Down