From 4eabcd6503f5df4230cb0faba46d17bbb2c2a39d Mon Sep 17 00:00:00 2001 From: parthiban Date: Mon, 13 Nov 2023 14:22:53 +0530 Subject: [PATCH 1/8] BUG: Set check_exact to true if dtype is int --- pandas/_testing/asserters.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 5f14d46be8e70..7bc07d7e17a95 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -921,6 +921,9 @@ def assert_series_equal( else: assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}") + if is_integer_dtype(left.dtype) and is_integer_dtype(right.dtype): + check_exact = True + if check_exact and is_numeric_dtype(left.dtype) and is_numeric_dtype(right.dtype): left_values = left._values right_values = right._values From bce308712c702c93aa6f3e15c3fbea4c42b96d61 Mon Sep 17 00:00:00 2001 From: parthiban Date: Mon, 13 Nov 2023 15:11:53 +0530 Subject: [PATCH 2/8] BUG: Add check to ignore dtype difference - If int dtype is different we are ignoring the difference - so added check to set check_exact to true only when dtype is same --- pandas/_testing/asserters.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 7bc07d7e17a95..17a625f6cef71 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -921,7 +921,11 @@ def assert_series_equal( else: assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}") - if is_integer_dtype(left.dtype) and is_integer_dtype(right.dtype): + if ( + is_integer_dtype(left.dtype) + and is_integer_dtype(right.dtype) + and left.dtype == right.dtype + ): check_exact = True if check_exact and is_numeric_dtype(left.dtype) and is_numeric_dtype(right.dtype): From 8fd5e611dd035f08cd1eb1b301d738f7fbb3abf2 Mon Sep 17 00:00:00 2001 From: parthiban Date: Mon, 13 Nov 2023 15:16:17 +0530 Subject: [PATCH 3/8] TST: Added test cases ---
pandas/tests/util/test_assert_series_equal.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index 12b5987cdb3de..3db960c2ba038 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ b/pandas/tests/util/test_assert_series_equal.py @@ -423,3 +423,11 @@ def test_check_dtype_false_different_reso(dtype): with pytest.raises(AssertionError, match="Series are different"): tm.assert_series_equal(ser_s, ser_ms, check_dtype=False) + + +def test_check_exact_true_for_int_dtype(): + # GH 55882 + left = Series([1577840521123000]) + right = Series([1577840521123543]) + with pytest.raises(AssertionError, match="Series are different"): + tm.assert_series_equal(left, right) From c3ecb846ba52f7c8df0b083c005092d15c3a0a78 Mon Sep 17 00:00:00 2001 From: parthiban Date: Mon, 13 Nov 2023 22:18:46 +0530 Subject: [PATCH 4/8] TST: Fix failing test cases --- pandas/_testing/asserters.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 17a625f6cef71..b934e0ba2aabb 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -924,7 +924,8 @@ def assert_series_equal( if ( is_integer_dtype(left.dtype) and is_integer_dtype(right.dtype) - and left.dtype == right.dtype + and isinstance(left._values, type(right._values)) + and isinstance(right._values, type(left._values)) ): check_exact = True From 6d74ec351fba3dd256406871b8b1bfbd0c2f2db8 Mon Sep 17 00:00:00 2001 From: parthiban Date: Sun, 19 Nov 2023 10:16:00 +0530 Subject: [PATCH 5/8] DOC: Update function documentation --- pandas/_testing/asserters.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index b934e0ba2aabb..87b0a406ca851 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -828,6 +828,7 @@ def assert_series_equal( Whether to check the 
Series and Index names attribute. check_exact : bool, default False Whether to compare number exactly. + Note: Will be set to True if dtype is int. check_datetimelike_compat : bool, default False Compare datetime-like which is comparable ignoring dtype. check_categorical : bool, default True From c46b2d60469b5e2e77de7f841f0b92c80fc79a1d Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Thu, 23 Nov 2023 14:27:42 +0000 Subject: [PATCH 6/8] check_exact only takes effect for floating dtypes --- doc/source/whatsnew/v2.2.0.rst | 2 +- pandas/_testing/asserters.py | 25 ++++++++--------- pandas/tests/extension/base/methods.py | 16 +++++++---- pandas/tests/series/test_constructors.py | 10 +++++-- pandas/tests/tools/test_to_datetime.py | 8 +++++- pandas/tests/util/test_assert_frame_equal.py | 18 ++++++++++--- pandas/tests/util/test_assert_series_equal.py | 27 +++++++++++++------ 7 files changed, 71 insertions(+), 35 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index af14856fa3b6a..f35dff57c8566 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -315,7 +315,7 @@ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for mor Other API changes ^^^^^^^^^^^^^^^^^ -- +- ``check_exact`` now only takes effect for floating-point dtypes in :func:`testing.assert_frame_equal` and :func:`testing.assert_series_equal`. In particular, integer dtypes are always checked exactly (:issue:`55882`) - .. 
--------------------------------------------------------------------------- diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index de3d85a4793ca..5899bae742076 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -16,6 +16,7 @@ from pandas.core.dtypes.common import ( is_bool, + is_float_dtype, is_integer_dtype, is_number, is_numeric_dtype, @@ -713,7 +714,7 @@ def assert_extension_array_equal( index_values : Index | numpy.ndarray, default None Optional index (shared by both left and right), used in output. check_exact : bool, default False - Whether to compare number exactly. + Whether to compare number exactly. Only takes effect for float dtypes. rtol : float, default 1e-5 Relative tolerance. Only used when check_exact is False. atol : float, default 1e-8 @@ -782,7 +783,10 @@ def assert_extension_array_equal( left_valid = left[~left_na].to_numpy(dtype=object) right_valid = right[~right_na].to_numpy(dtype=object) - if check_exact: + if check_exact or ( + (is_numeric_dtype(left.dtype) and not is_float_dtype(left.dtype)) + or (is_numeric_dtype(right.dtype) and not is_float_dtype(right.dtype)) + ): assert_numpy_array_equal( left_valid, right_valid, obj=obj, index_values=index_values ) @@ -836,8 +840,7 @@ def assert_series_equal( check_names : bool, default True Whether to check the Series and Index names attribute. check_exact : bool, default False - Whether to compare number exactly. - Note: Will be set to True if dtype is int. + Whether to compare number exactly. Only takes effect for float dtypes. check_datetimelike_compat : bool, default False Compare datetime-like which is comparable ignoring dtype. 
check_categorical : bool, default True @@ -930,16 +933,10 @@ def assert_series_equal( pass else: assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}") - - if ( - is_integer_dtype(left.dtype) - and is_integer_dtype(right.dtype) - and isinstance(left._values, type(right._values)) - and isinstance(right._values, type(left._values)) + if check_exact or ( + (is_numeric_dtype(left.dtype) and not is_float_dtype(left.dtype)) + or (is_numeric_dtype(right.dtype) and not is_float_dtype(right.dtype)) ): - check_exact = True - - if check_exact and is_numeric_dtype(left.dtype) and is_numeric_dtype(right.dtype): left_values = left._values right_values = right._values # Only check exact if dtype is numeric @@ -1102,7 +1099,7 @@ def assert_frame_equal( Specify how to compare internal data. If False, compare by columns. If True, compare by blocks. check_exact : bool, default False - Whether to compare number exactly. + Whether to compare number exactly. Only takes effect for float dtypes. check_datetimelike_compat : bool, default False Compare datetime-like which is comparable ignoring dtype. 
check_categorical : bool, default True diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index b9407c7197f20..a0729c92e1a77 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -7,6 +7,7 @@ from pandas._typing import Dtype from pandas.core.dtypes.common import is_bool_dtype +from pandas.core.dtypes.dtypes import NumpyEADtype from pandas.core.dtypes.missing import na_value_for_dtype import pandas as pd @@ -331,7 +332,8 @@ def test_fillna_length_mismatch(self, data_missing): data_missing.fillna(data_missing.take([1])) # Subclasses can override if we expect e.g Sparse[bool], boolean, pyarrow[bool] - _combine_le_expected_dtype: Dtype = np.dtype(bool) + # _combine_le_expected_dtype: Dtype = np.dtype(bool) + _combine_le_expected_dtype: Dtype = NumpyEADtype("bool") def test_combine_le(self, data_repeated): # GH 20825 @@ -341,16 +343,20 @@ def test_combine_le(self, data_repeated): s2 = pd.Series(orig_data2) result = s1.combine(s2, lambda x1, x2: x1 <= x2) expected = pd.Series( - [a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))], - dtype=self._combine_le_expected_dtype, + pd.array( + [a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))], + dtype=self._combine_le_expected_dtype, + ) ) tm.assert_series_equal(result, expected) val = s1.iloc[0] result = s1.combine(val, lambda x1, x2: x1 <= x2) expected = pd.Series( - [a <= val for a in list(orig_data1)], - dtype=self._combine_le_expected_dtype, + pd.array( + [a <= val for a in list(orig_data1)], + dtype=self._combine_le_expected_dtype, + ) ) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 65726eb8fcbb8..b8969072b91dc 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -572,7 +572,10 @@ def test_constructor_maskedarray(self): data[1] = 1 result = Series(data, index=index) 
expected = Series([0, 1, 2], index=index, dtype=int) - tm.assert_series_equal(result, expected) + with pytest.raises(AssertionError, match="Series classes are different"): + # TODO should this be raising at all? + # https://github.com/pandas-dev/pandas/issues/56131 + tm.assert_series_equal(result, expected) data = ma.masked_all((3,), dtype=bool) result = Series(data) @@ -589,7 +592,10 @@ def test_constructor_maskedarray(self): data[1] = True result = Series(data, index=index) expected = Series([True, True, False], index=index, dtype=bool) - tm.assert_series_equal(result, expected) + with pytest.raises(AssertionError, match="Series classes are different"): + # TODO should this be raising at all? + # https://github.com/pandas-dev/pandas/issues/56131 + tm.assert_series_equal(result, expected) data = ma.masked_all((3,), dtype="M8[ns]") result = Series(data) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 503032293dc81..ba8edda660860 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -2116,7 +2116,13 @@ def test_float_to_datetime_raise_near_bounds(self): expected = (should_succeed * oneday_in_ns).astype(np.int64) for error_mode in ["raise", "coerce", "ignore"]: result1 = to_datetime(should_succeed, unit="D", errors=error_mode) - tm.assert_almost_equal(result1.astype(np.int64), expected, rtol=1e-10) + # Cast to `np.float64` so that `rtol` and inexact checking kick in + # (`check_exact` doesn't take place for integer dtypes) + tm.assert_almost_equal( + result1.astype(np.int64).astype(np.float64), + expected.astype(np.float64), + rtol=1e-10, + ) # just out of bounds should_fail1 = Series([0, tsmax_in_days + 0.005], dtype=float) should_fail2 = Series([0, -tsmax_in_days - 0.005], dtype=float) diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py index 2d3b47cd2e994..f5a8324e1896d 100644 --- 
a/pandas/tests/util/test_assert_frame_equal.py +++ b/pandas/tests/util/test_assert_frame_equal.py @@ -203,7 +203,10 @@ def test_assert_frame_equal_extension_dtype_mismatch(): "\\[right\\]: int[32|64]" ) - tm.assert_frame_equal(left, right, check_dtype=False) + # TODO: this shouldn't raise (or should raise a better error message) + # https://github.com/pandas-dev/pandas/issues/56131 + with pytest.raises(AssertionError, match="classes are different"): + tm.assert_frame_equal(left, right, check_dtype=False) with pytest.raises(AssertionError, match=msg): tm.assert_frame_equal(left, right, check_dtype=True) @@ -228,11 +231,18 @@ def test_assert_frame_equal_interval_dtype_mismatch(): tm.assert_frame_equal(left, right, check_dtype=True) -@pytest.mark.parametrize("right_dtype", ["Int32", "int64"]) -def test_assert_frame_equal_ignore_extension_dtype_mismatch(right_dtype): +def test_assert_frame_equal_ignore_extension_dtype_mismatch(): + # https://github.com/pandas-dev/pandas/issues/35715 + left = DataFrame({"a": [1, 2, 3]}, dtype="Int64") + right = DataFrame({"a": [1, 2, 3]}, dtype="Int32") + tm.assert_frame_equal(left, right, check_dtype=False) + + +@pytest.mark.xfail(reason="https://github.com/pandas-dev/pandas/issues/56131") +def test_assert_frame_equal_ignore_extension_dtype_mismatch_cross_class(): # https://github.com/pandas-dev/pandas/issues/35715 left = DataFrame({"a": [1, 2, 3]}, dtype="Int64") - right = DataFrame({"a": [1, 2, 3]}, dtype=right_dtype) + right = DataFrame({"a": [1, 2, 3]}, dtype="int64") tm.assert_frame_equal(left, right, check_dtype=False) diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index 3db960c2ba038..33cb32e58a27f 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ b/pandas/tests/util/test_assert_series_equal.py @@ -276,7 +276,10 @@ def test_assert_series_equal_extension_dtype_mismatch(): \\[left\\]: Int64 \\[right\\]: int[32|64]""" - tm.assert_series_equal(left, right, 
check_dtype=False) + # TODO: this shouldn't raise (or should raise a better error message) + # https://github.com/pandas-dev/pandas/issues/56131 + with pytest.raises(AssertionError, match="Series classes are different"): + tm.assert_series_equal(left, right, check_dtype=False) with pytest.raises(AssertionError, match=msg): tm.assert_series_equal(left, right, check_dtype=True) @@ -348,11 +351,18 @@ def test_series_equal_exact_for_nonnumeric(): tm.assert_series_equal(s3, s1, check_exact=True) -@pytest.mark.parametrize("right_dtype", ["Int32", "int64"]) -def test_assert_series_equal_ignore_extension_dtype_mismatch(right_dtype): +def test_assert_series_equal_ignore_extension_dtype_mismatch(): + # https://github.com/pandas-dev/pandas/issues/35715 + left = Series([1, 2, 3], dtype="Int64") + right = Series([1, 2, 3], dtype="Int32") + tm.assert_series_equal(left, right, check_dtype=False) + + +@pytest.mark.xfail(reason="https://github.com/pandas-dev/pandas/issues/56131") +def test_assert_series_equal_ignore_extension_dtype_mismatch_cross_class(): # https://github.com/pandas-dev/pandas/issues/35715 left = Series([1, 2, 3], dtype="Int64") - right = Series([1, 2, 3], dtype=right_dtype) + right = Series([1, 2, 3], dtype="int64") tm.assert_series_equal(left, right, check_dtype=False) @@ -425,9 +435,10 @@ def test_check_dtype_false_different_reso(dtype): tm.assert_series_equal(ser_s, ser_ms, check_dtype=False) -def test_check_exact_true_for_int_dtype(): - # GH 55882 - left = Series([1577840521123000]) - right = Series([1577840521123543]) +@pytest.mark.parametrize("dtype", ["Int64", "int64"]) +def test_large_unequal_ints(dtype): + # https://github.com/pandas-dev/pandas/issues/55882 + left = Series([1577840521123000], dtype=dtype) + right = Series([1577840521123543], dtype=dtype) with pytest.raises(AssertionError, match="Series are different"): tm.assert_series_equal(left, right) From a4eabea1b721e1f249a4ad7cc0ef577416c95223 Mon Sep 17 00:00:00 2001 From: MarcoGorelli 
<33491632+MarcoGorelli@users.noreply.github.com> Date: Thu, 23 Nov 2023 16:22:38 +0000 Subject: [PATCH 7/8] xfail failing test --- pandas/tests/io/parser/dtypes/test_dtypes_basic.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 32b4b1dedc3cb..0deafda750904 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -531,6 +531,9 @@ def test_dtype_backend_pyarrow(all_parsers, request): tm.assert_frame_equal(result, expected) +# pyarrow engine failing: +# https://github.com/pandas-dev/pandas/issues/56136 +@pytest.mark.usefixtures("pyarrow_xfail") def test_ea_int_avoid_overflow(all_parsers): # GH#32134 parser = all_parsers From 00b7c91fa0da627570b09e172164168e4809fc86 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Mon, 27 Nov 2023 17:56:33 +0000 Subject: [PATCH 8/8] Update pandas/tests/extension/base/methods.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/tests/extension/base/methods.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index a0729c92e1a77..a354e5767f37f 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -332,7 +332,6 @@ def test_fillna_length_mismatch(self, data_missing): data_missing.fillna(data_missing.take([1])) # Subclasses can override if we expect e.g Sparse[bool], boolean, pyarrow[bool] - # _combine_le_expected_dtype: Dtype = np.dtype(bool) _combine_le_expected_dtype: Dtype = NumpyEADtype("bool") def test_combine_le(self, data_repeated):