From 6dd02166328361eb636e6f4e7dab87f31fa7d9f1 Mon Sep 17 00:00:00 2001 From: Jake Thomas Trevallion Date: Sun, 2 Feb 2025 09:35:22 +0000 Subject: [PATCH 1/3] Add clearer error messages for datatype mismatch in HDFStore.append. Raise ValueError when nan_rep too large for pytable column. Add and modify applicable test code. --- pandas/io/pytables.py | 11 +++++++++++ pandas/tests/io/pytables/test_append.py | 19 ++++++++++--------- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index e18db2e53113f..e387691d2ff64 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3524,6 +3524,14 @@ def validate(self, other) -> None: # Value of type "Optional[Any]" is not indexable [index] oax = ov[i] # type: ignore[index] if sax != oax: + ## Raise clearer error if mismatching type on values_axes + if c == "values_axes" and sax.kind != oax.kind: + raise TypeError( + f"Cannot serialize the column [{oax.values[0]}] " + f"because its data contents are not [{oax.kind}] " + f"but [{sax.kind}] object dtype" + ) + # Fallback if other source of difference raise ValueError( f"invalid combination of [{c}] on appending data " f"[{sax}] vs current table [{oax}]" @@ -5136,6 +5144,9 @@ def _maybe_convert_for_string_atom( data = bvalues.copy() data[mask] = nan_rep + if existing_col and mask.any() and len(nan_rep) > existing_col.itemsize: + raise ValueError("NaN representation is too large for existing column size") + # see if we have a valid string type inferred_type = lib.infer_dtype(data, skipna=False) if inferred_type != "string": diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py index 47658c0eb9012..eb0dd7cd9c142 100644 --- a/pandas/tests/io/pytables/test_append.py +++ b/pandas/tests/io/pytables/test_append.py @@ -421,6 +421,14 @@ def check_col(key, name, size): with pytest.raises(ValueError, match=msg): store.append("df_new", df_new) + # bigger NaN representation on next append + df_new = DataFrame([[124, "a"], [346, "b"]]) + store.append("df_new2", df_new) + df_new = DataFrame([[124, None], [346, "b"]]) + msg = "NaN representation is too large for existing column size" + with pytest.raises(ValueError, match=msg): + store.append("df_new2", df_new) + # min_itemsize on Series index (GH 11412) df = DataFrame( { @@ -822,15 +830,8 @@ def test_append_raise(setup_path): df["foo"] = Timestamp("20130101") store.append("df", df) df["foo"] = "bar" - msg = re.escape( - "invalid combination of [values_axes] on appending data " - "[name->values_block_1,cname->values_block_1," - "dtype->bytes24,kind->string,shape->(1, 30)] " - "vs current table " - "[name->values_block_1,cname->values_block_1," - "dtype->datetime64[s],kind->datetime64[s],shape->None]" - ) - with pytest.raises(ValueError, match=msg): + msg = re.escape("Cannot serialize the column [foo] but [string] object dtype") + with pytest.raises(TypeError, match=msg): store.append("df", df) From 500ab5a9a2a6d386504345da73636b42a17485bc Mon Sep 17 00:00:00 2001 From: Jake Thomas Trevallion Date: Sun, 2 Feb 2025 10:00:09 +0000 Subject: [PATCH 2/3] Fix missed tests and correct mistake in error message. --- pandas/io/pytables.py | 4 ++-- pandas/tests/io/pytables/test_append.py | 6 +++++- pandas/tests/io/pytables/test_round_trip.py | 11 ++++------- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index e387691d2ff64..56c09649587cc 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3528,8 +3528,8 @@ def validate(self, other) -> None: if c == "values_axes" and sax.kind != oax.kind: raise TypeError( f"Cannot serialize the column [{oax.values[0]}] " - f"because its data contents are not [{oax.kind}] " - f"but [{sax.kind}] object dtype" + f"because its data contents are not [{sax.kind}] " + f"but [{oax.kind}] object dtype" ) # Fallback if other source of difference raise ValueError( diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py index eb0dd7cd9c142..9940adb39a8d3 100644 --- a/pandas/tests/io/pytables/test_append.py +++ b/pandas/tests/io/pytables/test_append.py @@ -830,7 +830,11 @@ def test_append_raise(setup_path): df["foo"] = Timestamp("20130101") store.append("df", df) df["foo"] = "bar" - msg = re.escape("Cannot serialize the column [foo] but [string] object dtype") + msg = re.escape( + "Cannot serialize the column [foo] " + "because its data contents are not [string] " + "but [datetime64[s]] object dtype" + ) with pytest.raises(TypeError, match=msg): store.append("df", df) diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py index 6b98a720e4299..bbb37eb64328b 100644 --- a/pandas/tests/io/pytables/test_round_trip.py +++ b/pandas/tests/io/pytables/test_round_trip.py @@ -213,14 +213,11 @@ def test_table_values_dtypes_roundtrip(setup_path): # incompatible dtype msg = re.escape( - "invalid combination of [values_axes] on appending data " - "[name->values_block_0,cname->values_block_0," - "dtype->float64,kind->float,shape->(1, 3)] vs " - "current table [name->values_block_0," - "cname->values_block_0,dtype->int64,kind->integer," - "shape->None]" + "Cannot serialize the column [a] " + "because its data contents are not [float] " + "but [integer] object dtype" ) - with pytest.raises(ValueError, match=msg): + with pytest.raises(TypeError, match=msg): store.append("df_i8", df1) # check creation/storage/retrieval of float32 (a bit hacky to From 4627462965e849574f0b6b39d8909cd5b6d3f714 Mon Sep 17 00:00:00 2001 From: Jake Thomas Trevallion Date: Mon, 3 Feb 2025 23:55:02 +0000 Subject: [PATCH 3/3] Remove excess comments. Reverse error type change to avoid api changes. Move nan_rep tests into separate function. --- pandas/io/pytables.py | 4 +-- pandas/tests/io/pytables/test_append.py | 36 +++++++++++++++------ pandas/tests/io/pytables/test_round_trip.py | 2 +- 3 files changed, 29 insertions(+), 13 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 56c09649587cc..b4c78b063c180 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3524,14 +3524,12 @@ def validate(self, other) -> None: # Value of type "Optional[Any]" is not indexable [index] oax = ov[i] # type: ignore[index] if sax != oax: - ## Raise clearer error if mismatching type on values_axes if c == "values_axes" and sax.kind != oax.kind: - raise TypeError( + raise ValueError( f"Cannot serialize the column [{oax.values[0]}] " f"because its data contents are not [{sax.kind}] " f"but [{oax.kind}] object dtype" ) - # Fallback if other source of difference raise ValueError( f"invalid combination of [{c}] on appending data " f"[{sax}] vs current table [{oax}]" diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py index 9940adb39a8d3..04241a78bff5f 100644 --- a/pandas/tests/io/pytables/test_append.py +++ b/pandas/tests/io/pytables/test_append.py @@ -421,14 +421,6 @@ def check_col(key, name, size): with pytest.raises(ValueError, match=msg): store.append("df_new", df_new) - # bigger NaN representation on next append - df_new = DataFrame([[124, "a"], [346, "b"]]) - store.append("df_new2", df_new) - df_new = DataFrame([[124, None], [346, "b"]]) - msg = "NaN representation is too large for existing column size" - with pytest.raises(ValueError, match=msg): - store.append("df_new2", df_new) - # min_itemsize on Series index (GH 11412) df = DataFrame( { @@ -835,7 +827,7 @@ def test_append_raise(setup_path): "because its data contents are not [string] " "but [datetime64[s]] object dtype" ) - with pytest.raises(TypeError, match=msg): + with pytest.raises(ValueError, match=msg): store.append("df", df) @@ -1002,3 +994,29 @@ def test_append_to_multiple_min_itemsize(setup_path): ) result = store.select_as_multiple(["index", "nums", "strs"]) tm.assert_frame_equal(result, expected, check_index_type=True) + + +def test_append_string_nan_rep(setup_path): + # GH 16300 + df = DataFrame({"A": "a", "B": "foo"}, index=np.arange(10)) + df_nan = df.copy() + df_nan.loc[0:4, :] = np.nan + msg = "NaN representation is too large for existing column size" + + with ensure_clean_store(setup_path) as store: + # string column too small + store.append("sa", df["A"]) + with pytest.raises(ValueError, match=msg): + store.append("sa", df_nan["A"]) + + # nan_rep too big + store.append("sb", df["B"], nan_rep="bars") + with pytest.raises(ValueError, match=msg): + store.append("sb", df_nan["B"]) + + # smaller modified nan_rep + store.append("sc", df["A"], nan_rep="n") + store.append("sc", df_nan["A"]) + result = store["sc"] + expected = concat([df["A"], df_nan["A"]]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py index bbb37eb64328b..875a792467828 100644 --- a/pandas/tests/io/pytables/test_round_trip.py +++ b/pandas/tests/io/pytables/test_round_trip.py @@ -217,7 +217,7 @@ def test_table_values_dtypes_roundtrip(setup_path): "because its data contents are not [float] " "but [integer] object dtype" ) - with pytest.raises(TypeError, match=msg): + with pytest.raises(ValueError, match=msg): store.append("df_i8", df1) # check creation/storage/retrieval of float32 (a bit hacky to