From db0dfa4d342fe91a37bd24faba3c046cad44dda5 Mon Sep 17 00:00:00 2001 From: Sanjith Chockan Date: Wed, 19 Jul 2023 02:19:45 +0000 Subject: [PATCH 01/10] use decimal to prevent precision loss --- pandas/core/computation/pytables.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index 5175884bca210..70d96ee2f2776 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -233,7 +233,10 @@ def stringify(value): result = metadata.searchsorted(v, side="left") return TermValue(result, result, "integer") elif kind == "integer": - v = int(float(v)) + from decimal import Decimal + + v_dec = Decimal(v) + v = int(v_dec.to_integral_exact(rounding="ROUND_HALF_EVEN")) return TermValue(v, v, kind) elif kind == "float": v = float(v) From ef56f12c2c9c105f79686681f720a44f85800134 Mon Sep 17 00:00:00 2001 From: Sanjith Chockan Date: Wed, 19 Jul 2023 02:57:25 +0000 Subject: [PATCH 02/10] added unit test --- pandas/tests/io/pytables/test_select.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pandas/tests/io/pytables/test_select.py b/pandas/tests/io/pytables/test_select.py index f14a3ad7c5e10..5c6dbae8f0aed 100644 --- a/pandas/tests/io/pytables/test_select.py +++ b/pandas/tests/io/pytables/test_select.py @@ -943,3 +943,20 @@ def test_select_empty_where(tmp_path, where): store.put("df", df, "t") result = read_hdf(store, "df", where=where) tm.assert_frame_equal(result, df) + + +def test_select_large_integer(tmp_path): + path = tmp_path / "large_int.h5" + s = HDFStore(path) + df = DataFrame( + zip( + ["a", "b", "c", "d"], + [-9223372036854775801, -9223372036854775802, -9223372036854775803, 123], + ), + columns=["x", "y"], + ) + s.append("data", df, data_columns=True, index=False) + assert ( + s.select("data", where="y==-9223372036854775801").get("y").get(0) + == -9223372036854775801 + ) From 721c8fa613a2d0252f38b5db9c4da9aa4611313a Mon 
Sep 17 00:00:00 2001 From: Sanjith Chockan Date: Wed, 19 Jul 2023 06:39:39 +0000 Subject: [PATCH 03/10] raise ValueError if given value cannot be converted to double --- pandas/core/computation/pytables.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index 70d96ee2f2776..b26e59de7dc0f 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -233,10 +233,16 @@ def stringify(value): result = metadata.searchsorted(v, side="left") return TermValue(result, result, "integer") elif kind == "integer": - from decimal import Decimal + from decimal import ( + Decimal, + InvalidOperation, + ) - v_dec = Decimal(v) - v = int(v_dec.to_integral_exact(rounding="ROUND_HALF_EVEN")) + try: + v_dec = Decimal(v) + v = int(v_dec.to_integral_exact(rounding="ROUND_HALF_EVEN")) + except InvalidOperation: + raise ValueError(f"could not convert {type(v)} to {kind}") return TermValue(v, v, kind) elif kind == "float": v = float(v) From b9e09004124a9fd324970ba81a8ba2d201d7df4b Mon Sep 17 00:00:00 2001 From: Sanjith Chockan Date: Wed, 19 Jul 2023 06:46:38 +0000 Subject: [PATCH 04/10] added bugfix in whatsnew --- doc/source/whatsnew/v2.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 6c2784bc93b0c..d7e1cb45bf269 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -478,10 +478,10 @@ Conversion - Bug in :func:`DataFrame.style.to_latex` and :func:`DataFrame.style.to_html` if the DataFrame contains integers with more digits than can be represented by floating point double precision (:issue:`52272`) - Bug in :func:`array` when given a ``datetime64`` or ``timedelta64`` dtype with unit of "s", "us", or "ms" returning :class:`PandasArray` instead of :class:`DatetimeArray` or :class:`TimedeltaArray` (:issue:`52859`) - Bug in 
:meth:`ArrowDtype.numpy_dtype` returning nanosecond units for non-nanosecond ``pyarrow.timestamp`` and ``pyarrow.duration`` types (:issue:`51800`) +- Bug in :meth:`BinOp.convert_value` loses precision when converting int to float if integer in DataFrame stored in HDFS file format is large (:issue:`54186`) - Bug in :meth:`DataFrame.__repr__` incorrectly raising a ``TypeError`` when the dtype of a column is ``np.record`` (:issue:`48526`) - Bug in :meth:`DataFrame.info` raising ``ValueError`` when ``use_numba`` is set (:issue:`51922`) - Bug in :meth:`DataFrame.insert` raising ``TypeError`` if ``loc`` is ``np.int64`` (:issue:`53193`) -- Strings ^^^^^^^ From 018a2eccf3c869c253422684430353326c9cfbe5 Mon Sep 17 00:00:00 2001 From: Sanjith Chockan Date: Wed, 19 Jul 2023 14:26:30 +0000 Subject: [PATCH 05/10] updated unit test --- pandas/core/computation/pytables.py | 2 +- pandas/tests/io/pytables/test_select.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index b26e59de7dc0f..e836ea20ede83 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -242,7 +242,7 @@ def stringify(value): v_dec = Decimal(v) v = int(v_dec.to_integral_exact(rounding="ROUND_HALF_EVEN")) except InvalidOperation: - raise ValueError(f"could not convert {type(v)} to {kind}") + raise ValueError("could not convert string to ") return TermValue(v, v, kind) elif kind == "float": v = float(v) diff --git a/pandas/tests/io/pytables/test_select.py b/pandas/tests/io/pytables/test_select.py index 5c6dbae8f0aed..61f3d26a7c7dd 100644 --- a/pandas/tests/io/pytables/test_select.py +++ b/pandas/tests/io/pytables/test_select.py @@ -914,7 +914,7 @@ def test_query_compare_column_type(setup_path): if col == "real_date": msg = 'Given date string "a" not likely a datetime' else: - msg = "could not convert string to " + msg = "could not convert string to" with pytest.raises(ValueError, 
match=msg): store.select("test", where=query) From 8bb016ccec6619230a5f571ece4729446d2adc4c Mon Sep 17 00:00:00 2001 From: Sanjith Chockan Date: Wed, 19 Jul 2023 17:26:43 +0000 Subject: [PATCH 06/10] added context manager and cleaned up assert for readability --- pandas/tests/io/pytables/test_select.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/pandas/tests/io/pytables/test_select.py b/pandas/tests/io/pytables/test_select.py index 61f3d26a7c7dd..8d9e0b9f5ffec 100644 --- a/pandas/tests/io/pytables/test_select.py +++ b/pandas/tests/io/pytables/test_select.py @@ -947,7 +947,7 @@ def test_select_empty_where(tmp_path, where): def test_select_large_integer(tmp_path): path = tmp_path / "large_int.h5" - s = HDFStore(path) + df = DataFrame( zip( ["a", "b", "c", "d"], @@ -955,8 +955,10 @@ def test_select_large_integer(tmp_path): ), columns=["x", "y"], ) - s.append("data", df, data_columns=True, index=False) - assert ( - s.select("data", where="y==-9223372036854775801").get("y").get(0) - == -9223372036854775801 - ) + result = None + with HDFStore(path) as s: + s.append("data", df, data_columns=True, index=False) + result = s.select("data", where="y==-9223372036854775801").get("y").get(0) + expected = df["y"][0] + + assert expected == result From 2966e0695b8bbcd7181b42a6340fcb85e8a7aa2f Mon Sep 17 00:00:00 2001 From: Sanjith Chockan Date: Wed, 19 Jul 2023 17:30:39 +0000 Subject: [PATCH 07/10] updated whatsnew --- doc/source/whatsnew/v2.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index d7e1cb45bf269..b0c2b845073f6 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -478,10 +478,10 @@ Conversion - Bug in :func:`DataFrame.style.to_latex` and :func:`DataFrame.style.to_html` if the DataFrame contains integers with more digits than can be represented by floating point double precision (:issue:`52272`) - Bug in 
:func:`array` when given a ``datetime64`` or ``timedelta64`` dtype with unit of "s", "us", or "ms" returning :class:`PandasArray` instead of :class:`DatetimeArray` or :class:`TimedeltaArray` (:issue:`52859`) - Bug in :meth:`ArrowDtype.numpy_dtype` returning nanosecond units for non-nanosecond ``pyarrow.timestamp`` and ``pyarrow.duration`` types (:issue:`51800`) -- Bug in :meth:`BinOp.convert_value` loses precision when converting int to float if integer in DataFrame stored in HDFS file format is large (:issue:`54186`) - Bug in :meth:`DataFrame.__repr__` incorrectly raising a ``TypeError`` when the dtype of a column is ``np.record`` (:issue:`48526`) - Bug in :meth:`DataFrame.info` raising ``ValueError`` when ``use_numba`` is set (:issue:`51922`) - Bug in :meth:`DataFrame.insert` raising ``TypeError`` if ``loc`` is ``np.int64`` (:issue:`53193`) +- Bug in :meth:`HDFStore.select` losing precision of large integers when stored and retrieved (:issue:`54186`) Strings ^^^^^^^ From 3264be4b0409f844f0d8e3e7e24a4bbc6fba7a24 Mon Sep 17 00:00:00 2001 From: Sanjith Chockan Date: Wed, 19 Jul 2023 18:09:50 +0000 Subject: [PATCH 08/10] moved imports to top --- pandas/core/computation/pytables.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index e836ea20ede83..d643d8ff4b7b9 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -2,6 +2,10 @@ from __future__ import annotations import ast +from decimal import ( + Decimal, + InvalidOperation, +) from functools import partial from typing import ( TYPE_CHECKING, @@ -233,11 +237,6 @@ def stringify(value): result = metadata.searchsorted(v, side="left") return TermValue(result, result, "integer") elif kind == "integer": - from decimal import ( - Decimal, - InvalidOperation, - ) - try: v_dec = Decimal(v) v = int(v_dec.to_integral_exact(rounding="ROUND_HALF_EVEN")) From
96513284899b29804be5c741b6ec1c3dff0d65f0 Mon Sep 17 00:00:00 2001 From: Sanjith Chockan Date: Wed, 19 Jul 2023 18:23:58 +0000 Subject: [PATCH 09/10] reverted to float's ValueError raise --- pandas/core/computation/pytables.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index d643d8ff4b7b9..75e1e82ffa8c8 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -239,9 +239,10 @@ def stringify(value): elif kind == "integer": try: v_dec = Decimal(v) - v = int(v_dec.to_integral_exact(rounding="ROUND_HALF_EVEN")) except InvalidOperation: - raise ValueError("could not convert string to ") + float(v) + else: + v = int(v_dec.to_integral_exact(rounding="ROUND_HALF_EVEN")) return TermValue(v, v, kind) elif kind == "float": v = float(v) From 2514f1eee301945ae9ca4deccb2e00c4fff4bf9f Mon Sep 17 00:00:00 2001 From: Sanjith Chockan Date: Wed, 19 Jul 2023 19:17:00 +0000 Subject: [PATCH 10/10] added comments for float(v) --- pandas/core/computation/pytables.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index 75e1e82ffa8c8..433421d35af55 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -240,6 +240,8 @@ def stringify(value): try: v_dec = Decimal(v) except InvalidOperation: + # GH 54186 + # convert v to float to raise float's ValueError float(v) else: v = int(v_dec.to_integral_exact(rounding="ROUND_HALF_EVEN"))