From 6dd02166328361eb636e6f4e7dab87f31fa7d9f1 Mon Sep 17 00:00:00 2001
From: Jake Thomas Trevallion <flamingood.media.jt@gmail.com>
Date: Sun, 2 Feb 2025 09:35:22 +0000
Subject: [PATCH 1/3] Add clearer error messages for datatype mismatch in
 HDFStore.append. Raise ValueError when nan_rep too large for pytable column.
 Add and modify applicable test code.

---
 pandas/io/pytables.py                   | 11 +++++++++++
 pandas/tests/io/pytables/test_append.py | 19 ++++++++++---------
 2 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index e18db2e53113f..e387691d2ff64 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -3524,6 +3524,14 @@ def validate(self, other) -> None:
                     # Value of type "Optional[Any]" is not indexable  [index]
                     oax = ov[i]  # type: ignore[index]
                     if sax != oax:
+                        ## Raise clearer error if mismatching type on values_axes
+                        if c == "values_axes" and sax.kind != oax.kind:
+                            raise TypeError(
+                                f"Cannot serialize the column [{oax.values[0]}] "
+                                f"because its data contents are not [{oax.kind}] "
+                                f"but [{sax.kind}] object dtype"
+                            )
+                        # Fallback if other source of difference
                         raise ValueError(
                             f"invalid combination of [{c}] on appending data "
                             f"[{sax}] vs current table [{oax}]"
@@ -5136,6 +5144,9 @@ def _maybe_convert_for_string_atom(
     data = bvalues.copy()
     data[mask] = nan_rep
 
+    if existing_col and mask.any() and len(nan_rep) > existing_col.itemsize:
+        raise ValueError("NaN representation is too large for existing column size")
+
     # see if we have a valid string type
     inferred_type = lib.infer_dtype(data, skipna=False)
     if inferred_type != "string":
diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py
index 47658c0eb9012..eb0dd7cd9c142 100644
--- a/pandas/tests/io/pytables/test_append.py
+++ b/pandas/tests/io/pytables/test_append.py
@@ -421,6 +421,14 @@ def check_col(key, name, size):
         with pytest.raises(ValueError, match=msg):
             store.append("df_new", df_new)
 
+        # bigger NaN representation on next append
+        df_new = DataFrame([[124, "a"], [346, "b"]])
+        store.append("df_new2", df_new)
+        df_new = DataFrame([[124, None], [346, "b"]])
+        msg = "NaN representation is too large for existing column size"
+        with pytest.raises(ValueError, match=msg):
+            store.append("df_new2", df_new)
+
         # min_itemsize on Series index (GH 11412)
         df = DataFrame(
             {
@@ -822,15 +830,8 @@ def test_append_raise(setup_path):
         df["foo"] = Timestamp("20130101")
         store.append("df", df)
         df["foo"] = "bar"
-        msg = re.escape(
-            "invalid combination of [values_axes] on appending data "
-            "[name->values_block_1,cname->values_block_1,"
-            "dtype->bytes24,kind->string,shape->(1, 30)] "
-            "vs current table "
-            "[name->values_block_1,cname->values_block_1,"
-            "dtype->datetime64[s],kind->datetime64[s],shape->None]"
-        )
-        with pytest.raises(ValueError, match=msg):
+        msg = re.escape("Cannot serialize the column [foo] but [string] object dtype")
+        with pytest.raises(TypeError, match=msg):
             store.append("df", df)
 
 

From 500ab5a9a2a6d386504345da73636b42a17485bc Mon Sep 17 00:00:00 2001
From: Jake Thomas Trevallion <flamingood.media.jt@gmail.com>
Date: Sun, 2 Feb 2025 10:00:09 +0000
Subject: [PATCH 2/3] Fix missed tests and correct mistake in error message.

---
 pandas/io/pytables.py                       |  4 ++--
 pandas/tests/io/pytables/test_append.py     |  6 +++++-
 pandas/tests/io/pytables/test_round_trip.py | 11 ++++-------
 3 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index e387691d2ff64..56c09649587cc 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -3528,8 +3528,8 @@ def validate(self, other) -> None:
                         if c == "values_axes" and sax.kind != oax.kind:
                             raise TypeError(
                                 f"Cannot serialize the column [{oax.values[0]}] "
-                                f"because its data contents are not [{oax.kind}] "
-                                f"but [{sax.kind}] object dtype"
+                                f"because its data contents are not [{sax.kind}] "
+                                f"but [{oax.kind}] object dtype"
                             )
                         # Fallback if other source of difference
                         raise ValueError(
diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py
index eb0dd7cd9c142..9940adb39a8d3 100644
--- a/pandas/tests/io/pytables/test_append.py
+++ b/pandas/tests/io/pytables/test_append.py
@@ -830,7 +830,11 @@ def test_append_raise(setup_path):
         df["foo"] = Timestamp("20130101")
         store.append("df", df)
         df["foo"] = "bar"
-        msg = re.escape("Cannot serialize the column [foo] but [string] object dtype")
+        msg = re.escape(
+            "Cannot serialize the column [foo] "
+            "because its data contents are not [string] "
+            "but [datetime64[s]] object dtype"
+        )
         with pytest.raises(TypeError, match=msg):
             store.append("df", df)
 
diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py
index 6b98a720e4299..bbb37eb64328b 100644
--- a/pandas/tests/io/pytables/test_round_trip.py
+++ b/pandas/tests/io/pytables/test_round_trip.py
@@ -213,14 +213,11 @@ def test_table_values_dtypes_roundtrip(setup_path):
 
         # incompatible dtype
         msg = re.escape(
-            "invalid combination of [values_axes] on appending data "
-            "[name->values_block_0,cname->values_block_0,"
-            "dtype->float64,kind->float,shape->(1, 3)] vs "
-            "current table [name->values_block_0,"
-            "cname->values_block_0,dtype->int64,kind->integer,"
-            "shape->None]"
+            "Cannot serialize the column [a] "
+            "because its data contents are not [float] "
+            "but [integer] object dtype"
         )
-        with pytest.raises(ValueError, match=msg):
+        with pytest.raises(TypeError, match=msg):
             store.append("df_i8", df1)
 
         # check creation/storage/retrieval of float32 (a bit hacky to

From 4627462965e849574f0b6b39d8909cd5b6d3f714 Mon Sep 17 00:00:00 2001
From: Jake Thomas Trevallion <flamingood.media.jt@gmail.com>
Date: Mon, 3 Feb 2025 23:55:02 +0000
Subject: [PATCH 3/3] Remove excess comments. Reverse error type change to
 avoid api changes. Move nan_rep tests into separate function.

---
 pandas/io/pytables.py                       |  4 +--
 pandas/tests/io/pytables/test_append.py     | 36 +++++++++++++++------
 pandas/tests/io/pytables/test_round_trip.py |  2 +-
 3 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index 56c09649587cc..b4c78b063c180 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -3524,14 +3524,12 @@ def validate(self, other) -> None:
                     # Value of type "Optional[Any]" is not indexable  [index]
                     oax = ov[i]  # type: ignore[index]
                     if sax != oax:
-                        ## Raise clearer error if mismatching type on values_axes
                         if c == "values_axes" and sax.kind != oax.kind:
-                            raise TypeError(
+                            raise ValueError(
                                 f"Cannot serialize the column [{oax.values[0]}] "
                                 f"because its data contents are not [{sax.kind}] "
                                 f"but [{oax.kind}] object dtype"
                             )
-                        # Fallback if other source of difference
                         raise ValueError(
                             f"invalid combination of [{c}] on appending data "
                             f"[{sax}] vs current table [{oax}]"
diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py
index 9940adb39a8d3..04241a78bff5f 100644
--- a/pandas/tests/io/pytables/test_append.py
+++ b/pandas/tests/io/pytables/test_append.py
@@ -421,14 +421,6 @@ def check_col(key, name, size):
         with pytest.raises(ValueError, match=msg):
             store.append("df_new", df_new)
 
-        # bigger NaN representation on next append
-        df_new = DataFrame([[124, "a"], [346, "b"]])
-        store.append("df_new2", df_new)
-        df_new = DataFrame([[124, None], [346, "b"]])
-        msg = "NaN representation is too large for existing column size"
-        with pytest.raises(ValueError, match=msg):
-            store.append("df_new2", df_new)
-
         # min_itemsize on Series index (GH 11412)
         df = DataFrame(
             {
@@ -835,7 +827,7 @@ def test_append_raise(setup_path):
             "because its data contents are not [string] "
             "but [datetime64[s]] object dtype"
         )
-        with pytest.raises(TypeError, match=msg):
+        with pytest.raises(ValueError, match=msg):
             store.append("df", df)
 
 
@@ -1002,3 +994,29 @@ def test_append_to_multiple_min_itemsize(setup_path):
         )
         result = store.select_as_multiple(["index", "nums", "strs"])
         tm.assert_frame_equal(result, expected, check_index_type=True)
+
+
+def test_append_string_nan_rep(setup_path):
+    # GH 16300
+    df = DataFrame({"A": "a", "B": "foo"}, index=np.arange(10))
+    df_nan = df.copy()
+    df_nan.loc[0:4, :] = np.nan
+    msg = "NaN representation is too large for existing column size"
+
+    with ensure_clean_store(setup_path) as store:
+        # string column too small
+        store.append("sa", df["A"])
+        with pytest.raises(ValueError, match=msg):
+            store.append("sa", df_nan["A"])
+
+        # nan_rep too big
+        store.append("sb", df["B"], nan_rep="bars")
+        with pytest.raises(ValueError, match=msg):
+            store.append("sb", df_nan["B"])
+
+        # smaller modified nan_rep
+        store.append("sc", df["A"], nan_rep="n")
+        store.append("sc", df_nan["A"])
+        result = store["sc"]
+        expected = concat([df["A"], df_nan["A"]])
+        tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py
index bbb37eb64328b..875a792467828 100644
--- a/pandas/tests/io/pytables/test_round_trip.py
+++ b/pandas/tests/io/pytables/test_round_trip.py
@@ -217,7 +217,7 @@ def test_table_values_dtypes_roundtrip(setup_path):
             "because its data contents are not [float] "
             "but [integer] object dtype"
         )
-        with pytest.raises(TypeError, match=msg):
+        with pytest.raises(ValueError, match=msg):
             store.append("df_i8", df1)
 
         # check creation/storage/retrieval of float32 (a bit hacky to