From b6b60370c625051fb89254a8c7316bb8b0659cb0 Mon Sep 17 00:00:00 2001
From: Jake Thomas Trevallion <136272202+JakeTT404@users.noreply.github.com>
Date: Wed, 5 Feb 2025 17:48:56 +0000
Subject: [PATCH 1/5] ENH: Improved error message and raise new error for
 small-string NaN edge case in HDFStore.append (#60829)

* Add clearer error messages for datatype mismatch in HDFStore.append. Raise ValueError when nan_rep too large for pytable column. Add and modify applicable test code.

* Fix missed tests and correct mistake in error message.

* Remove excess comments. Reverse error type change to avoid api changes. Move nan_rep tests into separate function.

(cherry picked from commit 57340ecd08580f26ee4a976c1f68b2f563c41569)
---
 pandas/io/pytables.py                       |  9 ++++++
 pandas/tests/io/pytables/test_append.py     | 35 +++++++++++++++++----
 pandas/tests/io/pytables/test_round_trip.py |  9 ++----
 3 files changed, 41 insertions(+), 12 deletions(-)

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index dbe2db9f9625b..6d5202c58a0d5 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -3464,6 +3464,12 @@ def validate(self, other) -> None:
                     # Value of type "Optional[Any]" is not indexable  [index]
                     oax = ov[i]  # type: ignore[index]
                     if sax != oax:
+                        if c == "values_axes" and sax.kind != oax.kind:
+                            raise ValueError(
+                                f"Cannot serialize the column [{oax.values[0]}] "
+                                f"because its data contents are not [{sax.kind}] "
+                                f"but [{oax.kind}] object dtype"
+                            )
                         raise ValueError(
                             f"invalid combination of [{c}] on appending data "
                             f"[{sax}] vs current table [{oax}]"
@@ -5111,6 +5117,9 @@ def _maybe_convert_for_string_atom(
     data = bvalues.copy()
     data[mask] = nan_rep
 
+    if existing_col and mask.any() and len(nan_rep) > existing_col.itemsize:
+        raise ValueError("NaN representation is too large for existing column size")
+
     # see if we have a valid string type
     inferred_type = lib.infer_dtype(data, skipna=False)
     if inferred_type != "string":
diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py
index 93e50455fe6a2..fd2deacb69b3c 100644
--- a/pandas/tests/io/pytables/test_append.py
+++ b/pandas/tests/io/pytables/test_append.py
@@ -818,12 +818,9 @@ def test_append_raise(setup_path):
         store.append("df", df)
         df["foo"] = "bar"
         msg = re.escape(
-            "invalid combination of [values_axes] on appending data "
-            "[name->values_block_1,cname->values_block_1,"
-            "dtype->bytes24,kind->string,shape->(1, 30)] "
-            "vs current table "
-            "[name->values_block_1,cname->values_block_1,"
-            "dtype->datetime64[s],kind->datetime64[s],shape->None]"
+            "Cannot serialize the column [foo] "
+            "because its data contents are not [string] "
+            "but [datetime64[s]] object dtype"
         )
         with pytest.raises(ValueError, match=msg):
             store.append("df", df)
@@ -989,3 +986,29 @@ def test_append_to_multiple_min_itemsize(setup_path):
         )
         result = store.select_as_multiple(["index", "nums", "strs"])
         tm.assert_frame_equal(result, expected, check_index_type=True)
+
+
+def test_append_string_nan_rep(setup_path):
+    # GH 16300
+    df = DataFrame({"A": "a", "B": "foo"}, index=np.arange(10))
+    df_nan = df.copy()
+    df_nan.loc[0:4, :] = np.nan
+    msg = "NaN representation is too large for existing column size"
+
+    with ensure_clean_store(setup_path) as store:
+        # string column too small
+        store.append("sa", df["A"])
+        with pytest.raises(ValueError, match=msg):
+            store.append("sa", df_nan["A"])
+
+        # nan_rep too big
+        store.append("sb", df["B"], nan_rep="bars")
+        with pytest.raises(ValueError, match=msg):
+            store.append("sb", df_nan["B"])
+
+        # smaller modified nan_rep
+        store.append("sc", df["A"], nan_rep="n")
+        store.append("sc", df_nan["A"])
+        result = store["sc"]
+        expected = concat([df["A"], df_nan["A"]])
+        tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py
index 2397d18b1019e..72d90b1273d65 100644
--- a/pandas/tests/io/pytables/test_round_trip.py
+++ b/pandas/tests/io/pytables/test_round_trip.py
@@ -213,12 +213,9 @@ def test_table_values_dtypes_roundtrip(setup_path):
 
         # incompatible dtype
         msg = re.escape(
-            "invalid combination of [values_axes] on appending data "
-            "[name->values_block_0,cname->values_block_0,"
-            "dtype->float64,kind->float,shape->(1, 3)] vs "
-            "current table [name->values_block_0,"
-            "cname->values_block_0,dtype->int64,kind->integer,"
-            "shape->None]"
+            "Cannot serialize the column [a] "
+            "because its data contents are not [float] "
+            "but [integer] object dtype"
         )
         with pytest.raises(ValueError, match=msg):
             store.append("df_i8", df1)

From 413563e614998262ff914c4e6dca65ebe32b048f Mon Sep 17 00:00:00 2001
From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com>
Date: Mon, 10 Feb 2025 09:23:52 -0500
Subject: [PATCH 2/5] TST(string dtype): Resolve xfails in pytables (#60795)

(cherry picked from commit 4511251ccf409f2ba71cab0283bdf751697ee539)
---
 pandas/io/pytables.py                         |  3 +
 pandas/tests/io/pytables/test_append.py       | 56 +++++++++-------
 pandas/tests/io/pytables/test_categorical.py  |  6 +-
 pandas/tests/io/pytables/test_complex.py      |  6 --
 pandas/tests/io/pytables/test_errors.py       | 18 ++---
 .../tests/io/pytables/test_file_handling.py   | 10 +--
 pandas/tests/io/pytables/test_keys.py         |  7 +-
 pandas/tests/io/pytables/test_put.py          |  4 +-
 pandas/tests/io/pytables/test_read.py         | 16 +++--
 pandas/tests/io/pytables/test_round_trip.py   | 49 ++++++++------
 pandas/tests/io/pytables/test_select.py       | 44 ++++++------
 pandas/tests/io/pytables/test_store.py        | 67 ++++++++++---------
 pandas/tests/io/pytables/test_timezones.py    |  6 --
 13 files changed, 143 insertions(+), 149 deletions(-)

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index 6d5202c58a0d5..d93a3f26934a0 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -5093,6 +5093,9 @@ def _maybe_convert_for_string_atom(
     errors,
     columns: list[str],
 ):
+    if isinstance(bvalues.dtype, StringDtype):
+        # "ndarray[Any, Any]" has no attribute "to_numpy"
+        bvalues = bvalues.to_numpy()  # type: ignore[union-attr]
     if bvalues.dtype != object:
         return bvalues
 
diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py
index fd2deacb69b3c..39c203c558a5b 100644
--- a/pandas/tests/io/pytables/test_append.py
+++ b/pandas/tests/io/pytables/test_append.py
@@ -25,10 +25,7 @@
     ensure_clean_store,
 )
 
-pytestmark = [
-    pytest.mark.single_cpu,
-    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
-]
+pytestmark = [pytest.mark.single_cpu]
 
 tables = pytest.importorskip("tables")
 
@@ -40,7 +37,7 @@ def test_append(setup_path):
         # tables.NaturalNameWarning):
         df = DataFrame(
             np.random.default_rng(2).standard_normal((20, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=20, freq="B"),
         )
         _maybe_remove(store, "df1")
@@ -201,7 +198,7 @@ def test_append_some_nans(setup_path):
         tm.assert_frame_equal(store["df3"], df3, check_index_type=True)
 
 
-def test_append_all_nans(setup_path):
+def test_append_all_nans(setup_path, using_infer_string):
     with ensure_clean_store(setup_path) as store:
         df = DataFrame(
             {
@@ -253,7 +250,13 @@ def test_append_all_nans(setup_path):
             _maybe_remove(store, "df")
             store.append("df", df[:10], dropna=True)
             store.append("df", df[10:], dropna=True)
-            tm.assert_frame_equal(store["df"], df, check_index_type=True)
+            result = store["df"]
+            expected = df
+            if using_infer_string:
+                # TODO: Test is incorrect when not using_infer_string.
+                #       Should take the last 4 rows unconditionally.
+                expected = expected[-4:]
+            tm.assert_frame_equal(result, expected, check_index_type=True)
 
             _maybe_remove(store, "df2")
             store.append("df2", df[:10], dropna=False)
@@ -292,7 +295,7 @@ def test_append_frame_column_oriented(setup_path):
         # column oriented
         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         df.index = df.index._with_freq(None)  # freq doesn't round-trip
@@ -417,7 +420,7 @@ def check_col(key, name, size):
             {
                 "A": [0.0, 1.0, 2.0, 3.0, 4.0],
                 "B": [0.0, 1.0, 0.0, 1.0, 0.0],
-                "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"], dtype=object),
+                "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"]),
                 "D": date_range("20130101", periods=5),
             }
         ).set_index("C")
@@ -444,7 +447,7 @@ def check_col(key, name, size):
         _maybe_remove(store, "df")
         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         df["string"] = "foo"
@@ -504,11 +507,12 @@ def test_append_with_empty_string(setup_path):
         tm.assert_frame_equal(store.select("df"), df)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_append_with_data_columns(setup_path):
     with ensure_clean_store(setup_path) as store:
         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         df.iloc[0, df.columns.get_loc("B")] = 1.0
@@ -684,8 +688,8 @@ def test_append_misc(setup_path):
     with ensure_clean_store(setup_path) as store:
         df = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
-            columns=Index(list("ABCD"), dtype=object),
-            index=Index([f"i-{i}" for i in range(30)], dtype=object),
+            columns=Index(list("ABCD")),
+            index=Index([f"i-{i}" for i in range(30)]),
         )
         store.append("df", df, chunksize=1)
         result = store.select("df")
@@ -701,8 +705,8 @@ def test_append_misc_chunksize(setup_path, chunksize):
     # more chunksize in append tests
     df = DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=Index(list("ABCD"), dtype=object),
-        index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=Index(list("ABCD")),
+        index=Index([f"i-{i}" for i in range(30)]),
     )
     df["string"] = "foo"
     df["float322"] = 1.0
@@ -742,15 +746,15 @@ def test_append_misc_empty_frame(setup_path):
 # the conversion from AM->BM converts the invalid object dtype column into
 # a datetime64 column no longer raising an error
 @td.skip_array_manager_not_yet_implemented
-def test_append_raise(setup_path):
+def test_append_raise(setup_path, using_infer_string):
     with ensure_clean_store(setup_path) as store:
         # test append with invalid input to get good error messages
 
         # list in column
         df = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
-            columns=Index(list("ABCD"), dtype=object),
-            index=Index([f"i-{i}" for i in range(30)], dtype=object),
+            columns=Index(list("ABCD")),
+            index=Index([f"i-{i}" for i in range(30)]),
         )
         df["invalid"] = [["a"]] * len(df)
         assert df.dtypes["invalid"] == np.object_
@@ -770,8 +774,8 @@ def test_append_raise(setup_path):
         # datetime with embedded nans as object
         df = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
-            columns=Index(list("ABCD"), dtype=object),
-            index=Index([f"i-{i}" for i in range(30)], dtype=object),
+            columns=Index(list("ABCD")),
+            index=Index([f"i-{i}" for i in range(30)]),
         )
         s = Series(datetime.datetime(2001, 1, 2), index=df.index)
         s = s.astype(object)
@@ -798,8 +802,8 @@ def test_append_raise(setup_path):
         # appending an incompatible table
         df = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
-            columns=Index(list("ABCD"), dtype=object),
-            index=Index([f"i-{i}" for i in range(30)], dtype=object),
+            columns=Index(list("ABCD")),
+            index=Index([f"i-{i}" for i in range(30)]),
         )
         store.append("df", df)
 
@@ -876,7 +880,7 @@ def test_append_with_timedelta(setup_path):
 def test_append_to_multiple(setup_path):
     df1 = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )
     df2 = df1.copy().rename(columns="{}_2".format)
@@ -913,12 +917,12 @@ def test_append_to_multiple(setup_path):
 def test_append_to_multiple_dropna(setup_path):
     df1 = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )
     df2 = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     ).rename(columns="{}_2".format)
     df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan
@@ -938,7 +942,7 @@ def test_append_to_multiple_dropna(setup_path):
 def test_append_to_multiple_dropna_false(setup_path):
     df1 = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )
     df2 = df1.copy().rename(columns="{}_2".format)
diff --git a/pandas/tests/io/pytables/test_categorical.py b/pandas/tests/io/pytables/test_categorical.py
index 07c797467e5e2..a875e19ea7f0e 100644
--- a/pandas/tests/io/pytables/test_categorical.py
+++ b/pandas/tests/io/pytables/test_categorical.py
@@ -16,10 +16,7 @@
     ensure_clean_store,
 )
 
-pytestmark = [
-    pytest.mark.single_cpu,
-    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
-]
+pytestmark = [pytest.mark.single_cpu]
 
 
 def test_categorical(setup_path):
@@ -143,6 +140,7 @@ def test_categorical(setup_path):
             store.select("df3/meta/s/meta")
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_categorical_conversion(tmp_path, setup_path):
     # GH13322
     # Check that read_hdf with categorical columns doesn't return rows if
diff --git a/pandas/tests/io/pytables/test_complex.py b/pandas/tests/io/pytables/test_complex.py
index d140cfc941e16..c5cac5a5caf09 100644
--- a/pandas/tests/io/pytables/test_complex.py
+++ b/pandas/tests/io/pytables/test_complex.py
@@ -1,8 +1,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -13,10 +11,6 @@
 
 from pandas.io.pytables import read_hdf
 
-pytestmark = pytest.mark.xfail(
-    using_string_dtype(), reason="TODO(infer_string)", strict=False
-)
-
 
 def test_complex_fixed(tmp_path, setup_path):
     df = DataFrame(
diff --git a/pandas/tests/io/pytables/test_errors.py b/pandas/tests/io/pytables/test_errors.py
index c31b9989ef35e..b28101c09820f 100644
--- a/pandas/tests/io/pytables/test_errors.py
+++ b/pandas/tests/io/pytables/test_errors.py
@@ -5,8 +5,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas import (
     CategoricalIndex,
     DataFrame,
@@ -24,10 +22,7 @@
     _maybe_adjust_name,
 )
 
-pytestmark = [
-    pytest.mark.single_cpu,
-    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
-]
+pytestmark = [pytest.mark.single_cpu]
 
 
 def test_pass_spec_to_storer(setup_path):
@@ -93,9 +88,14 @@ def test_unimplemented_dtypes_table_columns(setup_path):
 
     with ensure_clean_store(setup_path) as store:
         # this fails because we have a date in the object block......
-        msg = re.escape(
-            """Cannot serialize the column [datetime1]
-because its data contents are not [string] but [date] object dtype"""
+        msg = "|".join(
+            [
+                re.escape(
+                    "Cannot serialize the column [datetime1]\nbecause its data "
+                    "contents are not [string] but [date] object dtype"
+                ),
+                re.escape("[date] is not implemented as a table column"),
+            ]
         )
         with pytest.raises(TypeError, match=msg):
             store.append("df_unimplemented", df)
diff --git a/pandas/tests/io/pytables/test_file_handling.py b/pandas/tests/io/pytables/test_file_handling.py
index 1878f2a392e13..100a55e6e346d 100644
--- a/pandas/tests/io/pytables/test_file_handling.py
+++ b/pandas/tests/io/pytables/test_file_handling.py
@@ -3,8 +3,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.compat import (
     PY311,
     is_ci_environment,
@@ -34,9 +32,7 @@
 from pandas.io import pytables
 from pandas.io.pytables import Term
 
-pytestmark = [
-    pytest.mark.single_cpu,
-]
+pytestmark = [pytest.mark.single_cpu]
 
 
 @pytest.mark.parametrize("mode", ["r", "r+", "a", "w"])
@@ -323,7 +319,6 @@ def test_complibs(tmp_path, lvl, lib, request):
                 assert node.filters.complib == lib
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.skipif(
     not is_platform_little_endian(), reason="reason platform is not little endian"
 )
@@ -341,7 +336,6 @@ def test_encoding(setup_path):
         tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize(
     "val",
     [
@@ -356,7 +350,7 @@ def test_encoding(setup_path):
         [b"A\xf8\xfc", np.nan, b"", b"b", b"c"],
     ],
 )
-@pytest.mark.parametrize("dtype", ["category", object])
+@pytest.mark.parametrize("dtype", ["category", None])
 def test_latin_encoding(tmp_path, setup_path, dtype, val):
     enc = "latin-1"
     nan_rep = ""
diff --git a/pandas/tests/io/pytables/test_keys.py b/pandas/tests/io/pytables/test_keys.py
index 7d0802dcf2e47..9c5fc8786c7c6 100644
--- a/pandas/tests/io/pytables/test_keys.py
+++ b/pandas/tests/io/pytables/test_keys.py
@@ -1,8 +1,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas import (
     DataFrame,
     HDFStore,
@@ -15,10 +13,7 @@
     tables,
 )
 
-pytestmark = [
-    pytest.mark.single_cpu,
-    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
-]
+pytestmark = [pytest.mark.single_cpu]
 
 
 def test_keys(setup_path):
diff --git a/pandas/tests/io/pytables/test_put.py b/pandas/tests/io/pytables/test_put.py
index 38f0379eb9a66..36ca68eb227a6 100644
--- a/pandas/tests/io/pytables/test_put.py
+++ b/pandas/tests/io/pytables/test_put.py
@@ -22,9 +22,7 @@
 )
 from pandas.util import _test_decorators as td
 
-pytestmark = [
-    pytest.mark.single_cpu,
-]
+pytestmark = [pytest.mark.single_cpu]
 
 
 def test_format_type(tmp_path, setup_path):
diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py
index 28cd8aea1defc..bfebf18c0e0ab 100644
--- a/pandas/tests/io/pytables/test_read.py
+++ b/pandas/tests/io/pytables/test_read.py
@@ -28,10 +28,7 @@
 
 from pandas.io.pytables import TableIterator
 
-pytestmark = [
-    pytest.mark.single_cpu,
-    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
-]
+pytestmark = [pytest.mark.single_cpu]
 
 
 def test_read_missing_key_close_store(tmp_path, setup_path):
@@ -77,10 +74,11 @@ def test_read_missing_key_opened_store(tmp_path, setup_path):
         read_hdf(store, "k1")
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_read_column(setup_path):
     df = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )
 
@@ -221,7 +219,7 @@ def test_legacy_table_read_py2(datapath):
     tm.assert_frame_equal(expected, result)
 
 
-def test_read_hdf_open_store(tmp_path, setup_path):
+def test_read_hdf_open_store(tmp_path, setup_path, using_infer_string):
     # GH10330
     # No check for non-string path_or-buf, and no test of open store
     df = DataFrame(
@@ -233,6 +231,12 @@ def test_read_hdf_open_store(tmp_path, setup_path):
     df = df.set_index(keys="E", append=True)
 
     path = tmp_path / setup_path
+    if using_infer_string:
+        # TODO(infer_string) make this work for string dtype
+        msg = "Saving a MultiIndex with an extension dtype is not supported."
+        with pytest.raises(NotImplementedError, match=msg):
+            df.to_hdf(path, key="df", mode="w")
+        return
     df.to_hdf(path, key="df", mode="w")
     direct = read_hdf(path, "df")
     with HDFStore(path, mode="r") as store:
diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py
index 72d90b1273d65..040708c9cedd0 100644
--- a/pandas/tests/io/pytables/test_round_trip.py
+++ b/pandas/tests/io/pytables/test_round_trip.py
@@ -4,8 +4,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas._libs.tslibs import Timestamp
 from pandas.compat import is_platform_windows
 
@@ -26,10 +24,7 @@
 )
 from pandas.util import _test_decorators as td
 
-pytestmark = [
-    pytest.mark.single_cpu,
-    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
-]
+pytestmark = [pytest.mark.single_cpu]
 
 
 def test_conv_read_write():
@@ -49,8 +44,8 @@ def roundtrip(key, obj, **kwargs):
 
         o = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
-            columns=Index(list("ABCD"), dtype=object),
-            index=Index([f"i-{i}" for i in range(30)], dtype=object),
+            columns=Index(list("ABCD")),
+            index=Index([f"i-{i}" for i in range(30)]),
         )
         tm.assert_frame_equal(o, roundtrip("frame", o))
 
@@ -150,8 +145,8 @@ def test_api_invalid(tmp_path, setup_path):
     # Invalid.
     df = DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=Index(list("ABCD"), dtype=object),
-        index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=Index(list("ABCD")),
+        index=Index([f"i-{i}" for i in range(30)]),
     )
 
     msg = "Can only append to Tables"
@@ -201,7 +196,7 @@ def test_put_integer(setup_path):
     _check_roundtrip(df, tm.assert_frame_equal, setup_path)
 
 
-def test_table_values_dtypes_roundtrip(setup_path):
+def test_table_values_dtypes_roundtrip(setup_path, using_infer_string):
     with ensure_clean_store(setup_path) as store:
         df1 = DataFrame({"a": [1, 2, 3]}, dtype="f8")
         store.append("df_f8", df1)
@@ -244,6 +239,7 @@ def test_table_values_dtypes_roundtrip(setup_path):
         store.append("df_mixed_dtypes1", df1)
         result = store.select("df_mixed_dtypes1").dtypes.value_counts()
         result.index = [str(i) for i in result.index]
+        str_dtype = "str" if using_infer_string else "object"
         expected = Series(
             {
                 "float32": 2,
@@ -253,7 +249,7 @@ def test_table_values_dtypes_roundtrip(setup_path):
                 "int16": 1,
                 "int8": 1,
                 "int64": 1,
-                "object": 1,
+                str_dtype: 1,
                 "datetime64[ns]": 2,
             },
             name="count",
@@ -273,10 +269,10 @@ def test_series(setup_path):
     )
     _check_roundtrip(ts, tm.assert_series_equal, path=setup_path)
 
-    ts2 = Series(ts.index, Index(ts.index, dtype=object))
+    ts2 = Series(ts.index, Index(ts.index))
     _check_roundtrip(ts2, tm.assert_series_equal, path=setup_path)
 
-    ts3 = Series(ts.values, Index(np.asarray(ts.index, dtype=object), dtype=object))
+    ts3 = Series(ts.values, Index(np.asarray(ts.index)))
     _check_roundtrip(
         ts3, tm.assert_series_equal, path=setup_path, check_index_type=False
     )
@@ -366,8 +362,8 @@ def test_timeseries_preepoch(setup_path, request):
 def test_frame(compression, setup_path):
     df = DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=Index(list("ABCD"), dtype=object),
-        index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=Index(list("ABCD")),
+        index=Index([f"i-{i}" for i in range(30)]),
     )
 
     # put in some random NAs
@@ -383,7 +379,7 @@ def test_frame(compression, setup_path):
 
     tdf = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )
     _check_roundtrip(
@@ -398,7 +394,10 @@ def test_frame(compression, setup_path):
         assert recons._mgr.is_consolidated()
 
     # empty
-    _check_roundtrip(df[:0], tm.assert_frame_equal, path=setup_path)
+    df2 = df[:0]
+    # Prevent df2 from having index with inferred_type as string
+    df2.index = Index([])
+    _check_roundtrip(df2[:0], tm.assert_frame_equal, path=setup_path)
 
 
 def test_empty_series_frame(setup_path):
@@ -430,9 +429,17 @@ def test_can_serialize_dates(setup_path):
     _check_roundtrip(frame, tm.assert_frame_equal, path=setup_path)
 
 
-def test_store_hierarchical(setup_path, multiindex_dataframe_random_data):
+def test_store_hierarchical(
+    setup_path, using_infer_string, multiindex_dataframe_random_data
+):
     frame = multiindex_dataframe_random_data
 
+    if using_infer_string:
+        # TODO(infer_string) make this work for string dtype
+        msg = "Saving a MultiIndex with an extension dtype is not supported."
+        with pytest.raises(NotImplementedError, match=msg):
+            _check_roundtrip(frame, tm.assert_frame_equal, path=setup_path)
+        return
     _check_roundtrip(frame, tm.assert_frame_equal, path=setup_path)
     _check_roundtrip(frame.T, tm.assert_frame_equal, path=setup_path)
     _check_roundtrip(frame["A"], tm.assert_series_equal, path=setup_path)
@@ -451,8 +458,8 @@ def test_store_mixed(compression, setup_path):
     def _make_one():
         df = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
-            columns=Index(list("ABCD"), dtype=object),
-            index=Index([f"i-{i}" for i in range(30)], dtype=object),
+            columns=Index(list("ABCD")),
+            index=Index([f"i-{i}" for i in range(30)]),
         )
         df["obj1"] = "foo"
         df["obj2"] = "bar"
diff --git a/pandas/tests/io/pytables/test_select.py b/pandas/tests/io/pytables/test_select.py
index 9f403f8293aed..f781b6756fec9 100644
--- a/pandas/tests/io/pytables/test_select.py
+++ b/pandas/tests/io/pytables/test_select.py
@@ -26,10 +26,7 @@
 
 from pandas.io.pytables import Term
 
-pytestmark = [
-    pytest.mark.single_cpu,
-    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
-]
+pytestmark = [pytest.mark.single_cpu]
 
 
 def test_select_columns_in_where(setup_path):
@@ -137,7 +134,7 @@ def test_select(setup_path):
         # select with columns=
         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         _maybe_remove(store, "df")
@@ -277,8 +274,8 @@ def test_select_dtypes(setup_path):
     with ensure_clean_store(setup_path) as store:
         df = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
-            columns=Index(list("ABCD"), dtype=object),
-            index=Index([f"i-{i}" for i in range(30)], dtype=object),
+            columns=Index(list("ABCD")),
+            index=Index([f"i-{i}" for i in range(30)]),
         )
 
         expected = df[df["A"] > 0]
@@ -342,7 +339,7 @@ def test_select_iterator(tmp_path, setup_path):
     with ensure_clean_store(setup_path) as store:
         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         _maybe_remove(store, "df")
@@ -367,7 +364,7 @@ def test_select_iterator(tmp_path, setup_path):
 
     df = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )
     df.to_hdf(path, key="df_non_table")
@@ -383,7 +380,7 @@ def test_select_iterator(tmp_path, setup_path):
 
     df = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )
     df.to_hdf(path, key="df", format="table")
@@ -400,7 +397,7 @@ def test_select_iterator(tmp_path, setup_path):
     with ensure_clean_store(setup_path) as store:
         df1 = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         store.append("df1", df1, data_columns=True)
@@ -428,7 +425,7 @@ def test_select_iterator_complete_8014(setup_path):
     with ensure_clean_store(setup_path) as store:
         expected = DataFrame(
             np.random.default_rng(2).standard_normal((100064, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=100064, freq="s"),
         )
         _maybe_remove(store, "df")
@@ -463,7 +460,7 @@ def test_select_iterator_complete_8014(setup_path):
     with ensure_clean_store(setup_path) as store:
         expected = DataFrame(
             np.random.default_rng(2).standard_normal((100064, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=100064, freq="s"),
         )
         _maybe_remove(store, "df")
@@ -505,7 +502,7 @@ def test_select_iterator_non_complete_8014(setup_path):
     with ensure_clean_store(setup_path) as store:
         expected = DataFrame(
             np.random.default_rng(2).standard_normal((100064, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=100064, freq="s"),
         )
         _maybe_remove(store, "df")
@@ -539,7 +536,7 @@ def test_select_iterator_non_complete_8014(setup_path):
     with ensure_clean_store(setup_path) as store:
         expected = DataFrame(
             np.random.default_rng(2).standard_normal((100064, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=100064, freq="s"),
         )
         _maybe_remove(store, "df")
@@ -563,7 +560,7 @@ def test_select_iterator_many_empty_frames(setup_path):
     with ensure_clean_store(setup_path) as store:
         expected = DataFrame(
             np.random.default_rng(2).standard_normal((100064, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=100064, freq="s"),
         )
         _maybe_remove(store, "df")
@@ -615,7 +612,7 @@ def test_select_iterator_many_empty_frames(setup_path):
 def test_frame_select(setup_path):
     df = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )
 
@@ -640,7 +637,7 @@ def test_frame_select(setup_path):
         # invalid terms
         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         store.append("df_time", df)
@@ -654,12 +651,13 @@ def test_frame_select(setup_path):
         #     store.select('frame', [crit1, crit2])
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_frame_select_complex(setup_path):
     # select via complex criteria
 
     df = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )
     df["string"] = "foo"
@@ -776,7 +774,7 @@ def test_invalid_filtering(setup_path):
 
     df = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )
 
@@ -798,7 +796,7 @@ def test_string_select(setup_path):
     with ensure_clean_store(setup_path) as store:
         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
 
@@ -842,7 +840,7 @@ def test_string_select(setup_path):
 def test_select_as_multiple(setup_path):
     df1 = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )
     df2 = df1.copy().rename(columns="{}_2".format)
@@ -967,6 +965,7 @@ def test_query_long_float_literal(setup_path):
         tm.assert_frame_equal(expected, result)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_query_compare_column_type(setup_path):
     # GH 15492
     df = DataFrame(
@@ -1043,7 +1042,6 @@ def test_select_large_integer(tmp_path):
         ),
         columns=["x", "y"],
     )
-    result = None
     with HDFStore(path) as s:
         s.append("data", df, data_columns=True, index=False)
         result = s.select("data", where="y==-9223372036854775801").get("y").get(0)
diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py
index 8a33cccf62fcf..c349d2143ad11 100644
--- a/pandas/tests/io/pytables/test_store.py
+++ b/pandas/tests/io/pytables/test_store.py
@@ -9,6 +9,8 @@
 
 from pandas._config import using_string_dtype
 
+from pandas.compat import HAS_PYARROW
+
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -33,10 +35,7 @@
     read_hdf,
 )
 
-pytestmark = [
-    pytest.mark.single_cpu,
-    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
-]
+pytestmark = [pytest.mark.single_cpu]
 
 tables = pytest.importorskip("tables")
 
@@ -108,7 +107,7 @@ def test_iter_empty(setup_path):
         assert list(store) == []
 
 
-def test_repr(setup_path):
+def test_repr(setup_path, using_infer_string):
     with ensure_clean_store(setup_path) as store:
         repr(store)
         store.info()
@@ -143,7 +142,9 @@ def test_repr(setup_path):
         df.loc[df.index[3:6], ["obj1"]] = np.nan
         df = df._consolidate()
 
-        with tm.assert_produces_warning(pd.errors.PerformanceWarning):
+        warning = None if using_infer_string else pd.errors.PerformanceWarning
+        msg = "cannot\nmap directly to c-types .* dtype='object'"
+        with tm.assert_produces_warning(warning, match=msg):
             store["df"] = df
 
         # make a random group in hdf space
@@ -314,7 +315,7 @@ def test_getattr(setup_path):
 
         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         store["df"] = df
@@ -381,7 +382,7 @@ def test_to_hdf_with_min_itemsize(tmp_path, setup_path):
         {
             "A": [0.0, 1.0, 2.0, 3.0, 4.0],
             "B": [0.0, 1.0, 0.0, 1.0, 0.0],
-            "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"], dtype=object),
+            "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"]),
             "D": date_range("20130101", periods=5),
         }
     ).set_index("C")
@@ -397,6 +398,10 @@ def test_to_hdf_with_min_itemsize(tmp_path, setup_path):
     tm.assert_series_equal(read_hdf(path, "ss4"), concat([df["B"], df2["B"]]))
 
 
+@pytest.mark.xfail(
+    using_string_dtype() and HAS_PYARROW,
+    reason="TODO(infer_string): can't encode '\ud800': surrogates not allowed",
+)
 @pytest.mark.parametrize("format", ["fixed", "table"])
 def test_to_hdf_errors(tmp_path, format, setup_path):
     data = ["\ud800foo"]
@@ -418,7 +423,7 @@ def col(t, column):
         # data columns
         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         df["string"] = "foo"
@@ -453,7 +458,7 @@ def col(t, column):
         # data columns
         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         df["string"] = "foo"
@@ -495,8 +500,8 @@ def test_table_mixed_dtypes(setup_path):
     # frame
     df = DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=Index(list("ABCD"), dtype=object),
-        index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=Index(list("ABCD")),
+        index=Index([f"i-{i}" for i in range(30)]),
     )
     df["obj1"] = "foo"
     df["obj2"] = "bar"
@@ -551,8 +556,8 @@ def test_remove(setup_path):
         )
         df = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
-            columns=Index(list("ABCD"), dtype=object),
-            index=Index([f"i-{i}" for i in range(30)], dtype=object),
+            columns=Index(list("ABCD")),
+            index=Index([f"i-{i}" for i in range(30)]),
         )
         store["a"] = ts
         store["b"] = df
@@ -615,8 +620,8 @@ def test_same_name_scoping(setup_path):
 def test_store_index_name(setup_path):
     df = DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=Index(list("ABCD"), dtype=object),
-        index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=Index(list("ABCD")),
+        index=Index([f"i-{i}" for i in range(30)]),
     )
     df.index.name = "foo"
 
@@ -658,8 +663,8 @@ def test_store_index_name_numpy_str(tmp_path, table_format, setup_path, unit, tz
 def test_store_series_name(setup_path):
     df = DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=Index(list("ABCD"), dtype=object),
-        index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=Index(list("ABCD")),
+        index=Index([f"i-{i}" for i in range(30)]),
     )
     series = df["A"]
 
@@ -673,7 +678,7 @@ def test_overwrite_node(setup_path):
     with ensure_clean_store(setup_path) as store:
         store["a"] = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         ts = Series(
@@ -687,7 +692,7 @@ def test_overwrite_node(setup_path):
 def test_coordinates(setup_path):
     df = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )
 
@@ -722,7 +727,7 @@ def test_coordinates(setup_path):
         _maybe_remove(store, "df2")
         df1 = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         df2 = df1.copy().rename(columns="{}_2".format)
@@ -878,8 +883,8 @@ def test_start_stop_fixed(setup_path):
         # sparse; not implemented
         df = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
-            columns=Index(list("ABCD"), dtype=object),
-            index=Index([f"i-{i}" for i in range(30)], dtype=object),
+            columns=Index(list("ABCD")),
+            index=Index([f"i-{i}" for i in range(30)]),
         )
         df.iloc[3:5, 1:3] = np.nan
         df.iloc[8:10, -2] = np.nan
@@ -905,8 +910,8 @@ def test_select_filter_corner(setup_path):
 def test_path_pathlib():
     df = DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=Index(list("ABCD"), dtype=object),
-        index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=Index(list("ABCD")),
+        index=Index([f"i-{i}" for i in range(30)]),
     )
 
     result = tm.round_trip_pathlib(
@@ -935,8 +940,8 @@ def test_contiguous_mixed_data_table(start, stop, setup_path):
 def test_path_pathlib_hdfstore():
     df = DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=Index(list("ABCD"), dtype=object),
-        index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=Index(list("ABCD")),
+        index=Index([f"i-{i}" for i in range(30)]),
     )
 
     def writer(path):
@@ -954,8 +959,8 @@ def reader(path):
 def test_pickle_path_localpath():
     df = DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=Index(list("ABCD"), dtype=object),
-        index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=Index(list("ABCD")),
+        index=Index([f"i-{i}" for i in range(30)]),
     )
     result = tm.round_trip_pathlib(
         lambda p: df.to_hdf(p, key="df"), lambda p: read_hdf(p, "df")
@@ -986,8 +991,8 @@ def reader(path):
 def test_copy(propindexes):
     df = DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=Index(list("ABCD"), dtype=object),
-        index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=Index(list("ABCD")),
+        index=Index([f"i-{i}" for i in range(30)]),
     )
 
     with tm.ensure_clean() as path:
diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py
index 05d630dc0e47c..c5613daf62207 100644
--- a/pandas/tests/io/pytables/test_timezones.py
+++ b/pandas/tests/io/pytables/test_timezones.py
@@ -6,8 +6,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas._libs.tslibs.timezones import maybe_get_tz
 import pandas.util._test_decorators as td
 
@@ -25,10 +23,6 @@
     ensure_clean_store,
 )
 
-pytestmark = pytest.mark.xfail(
-    using_string_dtype(), reason="TODO(infer_string)", strict=False
-)
-
 
 def _compare_with_tz(a, b):
     tm.assert_frame_equal(a, b)

From e0f47b77557bc990211c492a72405ecc2b0ff5f2 Mon Sep 17 00:00:00 2001
From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com>
Date: Mon, 17 Feb 2025 20:31:00 -0500
Subject: [PATCH 3/5] BUG(string dtype): Resolve pytables xfail when reading
 with condition (#60943)

(cherry picked from commit 0ec5f2668e9568d90595180d5ee925305ec7182e)
---
 pandas/io/pytables.py                        | 18 ++++++++++++++++--
 pandas/tests/io/pytables/test_append.py      |  3 ---
 pandas/tests/io/pytables/test_categorical.py |  3 ---
 pandas/tests/io/pytables/test_read.py        |  3 ---
 pandas/tests/io/pytables/test_select.py      |  4 ----
 5 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index d93a3f26934a0..65f95dab7b42f 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -4093,6 +4093,8 @@ def _create_axes(
                 ordered = data_converted.ordered
                 meta = "category"
                 metadata = np.asarray(data_converted.categories).ravel()
+            elif isinstance(blk.dtype, StringDtype):
+                meta = str(blk.dtype)
 
             data, dtype_name = _get_data_and_dtype_name(data_converted)
 
@@ -4360,7 +4362,9 @@ def read_column(
                     encoding=self.encoding,
                     errors=self.errors,
                 )
-                return Series(_set_tz(col_values[1], a.tz), name=column, copy=False)
+                cvs = _set_tz(col_values[1], a.tz)
+                dtype = getattr(self.table.attrs, f"{column}_meta", None)
+                return Series(cvs, name=column, copy=False, dtype=dtype)
 
         raise KeyError(f"column [{column}] not found in the table")
 
@@ -4708,8 +4712,18 @@ def read(
                 df = DataFrame._from_arrays([values], columns=cols_, index=index_)
             if not (using_string_dtype() and values.dtype.kind == "O"):
                 assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype)
+
+            # If str / string dtype is stored in meta, use that.
+            converted = False
+            for column in cols_:
+                dtype = getattr(self.table.attrs, f"{column}_meta", None)
+                if dtype in ["str", "string"]:
+                    df[column] = df[column].astype(dtype)
+                    converted = True
+            # Otherwise try inference.
             if (
-                using_string_dtype()
+                not converted
+                and using_string_dtype()
                 and isinstance(values, np.ndarray)
                 and is_string_array(
                     values,
diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py
index 39c203c558a5b..d0246c8f58d6a 100644
--- a/pandas/tests/io/pytables/test_append.py
+++ b/pandas/tests/io/pytables/test_append.py
@@ -5,8 +5,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas._libs.tslibs import Timestamp
 import pandas.util._test_decorators as td
 
@@ -507,7 +505,6 @@ def test_append_with_empty_string(setup_path):
         tm.assert_frame_equal(store.select("df"), df)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_append_with_data_columns(setup_path):
     with ensure_clean_store(setup_path) as store:
         df = DataFrame(
diff --git a/pandas/tests/io/pytables/test_categorical.py b/pandas/tests/io/pytables/test_categorical.py
index a875e19ea7f0e..449bc5cf1fc57 100644
--- a/pandas/tests/io/pytables/test_categorical.py
+++ b/pandas/tests/io/pytables/test_categorical.py
@@ -1,8 +1,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas import (
     Categorical,
     DataFrame,
@@ -140,7 +138,6 @@ def test_categorical(setup_path):
             store.select("df3/meta/s/meta")
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_categorical_conversion(tmp_path, setup_path):
     # GH13322
     # Check that read_hdf with categorical columns doesn't return rows if
diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py
index bfebf18c0e0ab..5bec673ad3c70 100644
--- a/pandas/tests/io/pytables/test_read.py
+++ b/pandas/tests/io/pytables/test_read.py
@@ -5,8 +5,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas._libs.tslibs import Timestamp
 from pandas.compat import is_platform_windows
 
@@ -74,7 +72,6 @@ def test_read_missing_key_opened_store(tmp_path, setup_path):
         read_hdf(store, "k1")
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_read_column(setup_path):
     df = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
diff --git a/pandas/tests/io/pytables/test_select.py b/pandas/tests/io/pytables/test_select.py
index f781b6756fec9..e76934745f004 100644
--- a/pandas/tests/io/pytables/test_select.py
+++ b/pandas/tests/io/pytables/test_select.py
@@ -1,8 +1,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas._libs.tslibs import Timestamp
 
 import pandas as pd
@@ -651,7 +649,6 @@ def test_frame_select(setup_path):
         #     store.select('frame', [crit1, crit2])
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_frame_select_complex(setup_path):
     # select via complex criteria
 
@@ -965,7 +962,6 @@ def test_query_long_float_literal(setup_path):
         tm.assert_frame_equal(expected, result)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_query_compare_column_type(setup_path):
     # GH 15492
     df = DataFrame(

From 3d5c84b5f9a4b5986bae5e5fd136de2f27b23be6 Mon Sep 17 00:00:00 2001
From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com>
Date: Tue, 18 Feb 2025 12:39:50 -0500
Subject: [PATCH 4/5] Backport PR #60940: ENH: Add dtype argument to str.decode

---
 doc/source/whatsnew/v2.3.0.rst       |  1 +
 pandas/core/strings/accessor.py      | 18 ++++++++++++++++--
 pandas/tests/strings/test_strings.py | 24 ++++++++++++++++++++++++
 3 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst
index 0b7c2bac1be6a..c4e01a86ce843 100644
--- a/doc/source/whatsnew/v2.3.0.rst
+++ b/doc/source/whatsnew/v2.3.0.rst
@@ -37,6 +37,7 @@ Other enhancements
   updated to raise FutureWarning with NumPy >= 2 (:issue:`60340`)
 - :meth:`Series.str.decode` result now has ``StringDtype`` when ``future.infer_string`` is True (:issue:`60709`)
 - :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with ``StringDtype``  (:issue:`60663`)
+- The :meth:`Series.str.decode` has gained the argument ``dtype`` to control the dtype of the result (:issue:`60940`)
 - The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns when backed by PyArrow (:issue:`60633`)
 - The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`)
 
diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
index 3c4cf60ab262a..c0e458f7968e7 100644
--- a/pandas/core/strings/accessor.py
+++ b/pandas/core/strings/accessor.py
@@ -33,6 +33,7 @@
     is_list_like,
     is_object_dtype,
     is_re,
+    is_string_dtype,
 )
 from pandas.core.dtypes.dtypes import (
     ArrowDtype,
@@ -1981,7 +1982,9 @@ def slice_replace(self, start=None, stop=None, repl=None):
         result = self._data.array._str_slice_replace(start, stop, repl)
         return self._wrap_result(result)
 
-    def decode(self, encoding, errors: str = "strict"):
+    def decode(
+        self, encoding, errors: str = "strict", dtype: str | DtypeObj | None = None
+    ):
         """
         Decode character string in the Series/Index using indicated encoding.
 
@@ -1992,6 +1995,14 @@ def decode(self, encoding, errors: str = "strict"):
         ----------
         encoding : str
         errors : str, optional
+            Specifies the error handling scheme.
+            Possible values are those supported by :meth:`bytes.decode`.
+        dtype : str or dtype, optional
+            The dtype of the result. When not ``None``, must be either a string or
+            object dtype. When ``None``, the dtype of the result is determined by
+            ``pd.options.future.infer_string``.
+
+            .. versionadded:: 2.3.0
 
         Returns
         -------
@@ -2008,6 +2019,10 @@ def decode(self, encoding, errors: str = "strict"):
         2   ()
         dtype: object
         """
+        if dtype is not None and not is_string_dtype(dtype):
+            raise ValueError(f"dtype must be string or object, got {dtype=}")
+        if dtype is None and get_option("future.infer_string"):
+            dtype = "str"
         # TODO: Add a similar _bytes interface.
         if encoding in _cpython_optimized_decoders:
             # CPython optimized implementation
@@ -2017,7 +2032,6 @@ def decode(self, encoding, errors: str = "strict"):
             f = lambda x: decoder(x, errors)[0]
         arr = self._data.array
         result = arr._str_map(f)
-        dtype = "str" if get_option("future.infer_string") else None
         return self._wrap_result(result, dtype=dtype)
 
     @forbid_nonstring_types(["bytes"])
diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py
index 59a06a421f53e..c729b910d05a7 100644
--- a/pandas/tests/strings/test_strings.py
+++ b/pandas/tests/strings/test_strings.py
@@ -599,6 +599,30 @@ def test_decode_errors_kwarg():
     tm.assert_series_equal(result, expected)
 
 
+def test_decode_string_dtype(string_dtype):
+    # https://github.com/pandas-dev/pandas/pull/60940
+    ser = Series([b"a", b"b"])
+    result = ser.str.decode("utf-8", dtype=string_dtype)
+    expected = Series(["a", "b"], dtype=string_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+def test_decode_object_dtype(object_dtype):
+    # https://github.com/pandas-dev/pandas/pull/60940
+    ser = Series([b"a", rb"\ud800"])
+    result = ser.str.decode("utf-8", dtype=object_dtype)
+    expected = Series(["a", r"\ud800"], dtype=object_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+def test_decode_bad_dtype():
+    # https://github.com/pandas-dev/pandas/pull/60940
+    ser = Series([b"a", b"b"])
+    msg = "dtype must be string or object, got dtype='int64'"
+    with pytest.raises(ValueError, match=msg):
+        ser.str.decode("utf-8", dtype="int64")
+
+
 @pytest.mark.parametrize(
     "form, expected",
     [

From 5e5db791fe78466d98d1da76d0ee5b9afbb61c21 Mon Sep 17 00:00:00 2001
From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com>
Date: Tue, 18 Feb 2025 21:35:41 -0500
Subject: [PATCH 5/5] Backport PR #60938: ENH(string dtype): Implement cumsum
 for Python-backed strings

---
 doc/source/whatsnew/v2.3.0.rst         |  2 +-
 pandas/core/arrays/string_.py          | 83 ++++++++++++++++++++++++++
 pandas/tests/apply/test_str.py         | 15 +----
 pandas/tests/extension/test_string.py  |  6 +-
 pandas/tests/series/test_cumulative.py | 11 ++--
 5 files changed, 92 insertions(+), 25 deletions(-)

diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst
index c4e01a86ce843..db3dcb50bacd0 100644
--- a/doc/source/whatsnew/v2.3.0.rst
+++ b/doc/source/whatsnew/v2.3.0.rst
@@ -38,7 +38,7 @@ Other enhancements
 - :meth:`Series.str.decode` result now has ``StringDtype`` when ``future.infer_string`` is True (:issue:`60709`)
 - :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with ``StringDtype``  (:issue:`60663`)
 - The :meth:`Series.str.decode` has gained the argument ``dtype`` to control the dtype of the result (:issue:`60940`)
-- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns when backed by PyArrow (:issue:`60633`)
+- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns (:issue:`60633`)
 - The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`)
 
 .. ---------------------------------------------------------------------------
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 3efb48c86e92c..c1048e806ff9a 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -46,6 +46,7 @@
 )
 
 from pandas.core import (
+    missing,
     nanops,
     ops,
 )
@@ -865,6 +866,88 @@ def _reduce(
             return result
         raise TypeError(f"Cannot perform reduction '{name}' with string dtype")
 
+    def _accumulate(self, name: str, *, skipna: bool = True, **kwargs) -> StringArray:
+        """
+        Return an ExtensionArray performing an accumulation operation.
+
+        The underlying data type might change.
+
+        Parameters
+        ----------
+        name : str
+            Name of the function, supported values are:
+            - cummin
+            - cummax
+            - cumsum
+            - cumprod (not supported for strings; raises TypeError)
+        skipna : bool, default True
+            If True, skip NA values.
+        **kwargs
+            Additional keyword arguments passed to the accumulation function.
+            Currently, there is no supported kwarg.
+
+        Returns
+        -------
+        array
+
+        Raises
+        ------
+        TypeError : if ``name`` is "cumprod", which is not supported for strings
+        """
+        if name == "cumprod":
+            msg = f"operation '{name}' not supported for dtype '{self.dtype}'"
+            raise TypeError(msg)
+
+        # We may need to strip out trailing NA values
+        tail: np.ndarray | None = None
+        na_mask: np.ndarray | None = None
+        ndarray = self._ndarray
+        np_func = {
+            "cumsum": np.cumsum,
+            "cummin": np.minimum.accumulate,
+            "cummax": np.maximum.accumulate,
+        }[name]
+
+        if self._hasna:
+            na_mask = cast("npt.NDArray[np.bool_]", isna(ndarray))
+            if np.all(na_mask):
+                return type(self)(ndarray)
+            if skipna:
+                if name == "cumsum":
+                    ndarray = np.where(na_mask, "", ndarray)
+                else:
+                    # We can retain the running min/max by forward/backward filling.
+                    ndarray = ndarray.copy()
+                    missing.pad_or_backfill_inplace(
+                        ndarray,
+                        method="pad",
+                        axis=0,
+                    )
+                    missing.pad_or_backfill_inplace(
+                        ndarray,
+                        method="backfill",
+                        axis=0,
+                    )
+            else:
+                # When not skipping NA values, the result should be null from
+                # the first NA value onward.
+                idx = np.argmax(na_mask)
+                tail = np.empty(len(ndarray) - idx, dtype="object")
+                tail[:] = self.dtype.na_value
+                ndarray = ndarray[:idx]
+
+        # mypy: Cannot call function of unknown type
+        np_result = np_func(ndarray)  # type: ignore[operator]
+
+        if tail is not None:
+            np_result = np.hstack((np_result, tail))
+        elif na_mask is not None:
+            # Argument 2 to "where" has incompatible type "NAType | float"
+            np_result = np.where(na_mask, self.dtype.na_value, np_result)  # type: ignore[arg-type]
+
+        result = type(self)(np_result)
+        return result
+
     def _wrap_reduction_result(self, axis: AxisInt | None, result) -> Any:
         if self.dtype.na_value is np.nan and result is libmissing.NA:
             # the masked_reductions use pd.NA -> convert to np.nan
diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py
index 9c7836a0aa167..17e8322dc40e1 100644
--- a/pandas/tests/apply/test_str.py
+++ b/pandas/tests/apply/test_str.py
@@ -4,8 +4,6 @@
 import numpy as np
 import pytest
 
-from pandas.compat import HAS_PYARROW
-
 from pandas.core.dtypes.common import is_number
 
 from pandas import (
@@ -168,21 +166,10 @@ def test_agg_cython_table_series(series, func, expected):
         ),
     ),
 )
-def test_agg_cython_table_transform_series(request, series, func, expected):
+def test_agg_cython_table_transform_series(series, func, expected):
     # GH21224
     # test transforming functions in
     # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum)
-    if (
-        series.dtype == "string"
-        and func in ("cumsum", np.cumsum, np.nancumsum)
-        and not HAS_PYARROW
-    ):
-        request.applymarker(
-            pytest.mark.xfail(
-                raises=NotImplementedError,
-                reason="TODO(infer_string) cumsum not yet implemented for string",
-            )
-        )
     warn = None if isinstance(func, str) else FutureWarning
     with tm.assert_produces_warning(warn, match="is currently using Series.*"):
         result = series.agg(func)
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index 301c7ee851aa0..526cf426781ad 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -200,11 +200,7 @@ def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
 
     def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool:
         assert isinstance(ser.dtype, StorageExtensionDtype)
-        return ser.dtype.storage == "pyarrow" and op_name in [
-            "cummin",
-            "cummax",
-            "cumsum",
-        ]
+        return op_name in ["cummin", "cummax", "cumsum"]
 
     def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
         dtype = cast(StringDtype, tm.get_dtype(obj))
diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py
index 0dc391db2182b..97f5fb4a9f96f 100644
--- a/pandas/tests/series/test_cumulative.py
+++ b/pandas/tests/series/test_cumulative.py
@@ -193,13 +193,14 @@ def test_cumprod_timedelta(self):
             ([pd.NA, pd.NA, pd.NA], "cummax", False, [pd.NA, pd.NA, pd.NA]),
         ],
     )
-    def test_cum_methods_pyarrow_strings(
-        self, pyarrow_string_dtype, data, op, skipna, expected_data
+    def test_cum_methods_ea_strings(
+        self, string_dtype_no_object, data, op, skipna, expected_data
     ):
-        # https://github.com/pandas-dev/pandas/pull/60633
-        ser = pd.Series(data, dtype=pyarrow_string_dtype)
+        # https://github.com/pandas-dev/pandas/pull/60633 - pyarrow
+        # https://github.com/pandas-dev/pandas/pull/60938 - Python
+        ser = pd.Series(data, dtype=string_dtype_no_object)
         method = getattr(ser, op)
-        expected = pd.Series(expected_data, dtype=pyarrow_string_dtype)
+        expected = pd.Series(expected_data, dtype=string_dtype_no_object)
         result = method(skipna=skipna)
         tm.assert_series_equal(result, expected)