From b6b60370c625051fb89254a8c7316bb8b0659cb0 Mon Sep 17 00:00:00 2001 From: Jake Thomas Trevallion <136272202+JakeTT404@users.noreply.github.com> Date: Wed, 5 Feb 2025 17:48:56 +0000 Subject: [PATCH 1/5] ENH: Improved error message and raise new error for small-string NaN edge case in HDFStore.append (#60829) * Add clearer error messages for datatype mismatch in HDFStore.append. Raise ValueError when nan_rep too large for pytable column. Add and modify applicable test code. * Fix missed tests and correct mistake in error message. * Remove excess comments. Reverse error type change to avoid api changes. Move nan_rep tests into separate function. (cherry picked from commit 57340ecd08580f26ee4a976c1f68b2f563c41569) --- pandas/io/pytables.py | 9 ++++++ pandas/tests/io/pytables/test_append.py | 35 +++++++++++++++++---- pandas/tests/io/pytables/test_round_trip.py | 9 ++---- 3 files changed, 41 insertions(+), 12 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index dbe2db9f9625b..6d5202c58a0d5 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3464,6 +3464,12 @@ def validate(self, other) -> None: # Value of type "Optional[Any]" is not indexable [index] oax = ov[i] # type: ignore[index] if sax != oax: + if c == "values_axes" and sax.kind != oax.kind: + raise ValueError( + f"Cannot serialize the column [{oax.values[0]}] " + f"because its data contents are not [{sax.kind}] " + f"but [{oax.kind}] object dtype" + ) raise ValueError( f"invalid combination of [{c}] on appending data " f"[{sax}] vs current table [{oax}]" @@ -5111,6 +5117,9 @@ def _maybe_convert_for_string_atom( data = bvalues.copy() data[mask] = nan_rep + if existing_col and mask.any() and len(nan_rep) > existing_col.itemsize: + raise ValueError("NaN representation is too large for existing column size") + # see if we have a valid string type inferred_type = lib.infer_dtype(data, skipna=False) if inferred_type != "string": diff --git 
a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py index 93e50455fe6a2..fd2deacb69b3c 100644 --- a/pandas/tests/io/pytables/test_append.py +++ b/pandas/tests/io/pytables/test_append.py @@ -818,12 +818,9 @@ def test_append_raise(setup_path): store.append("df", df) df["foo"] = "bar" msg = re.escape( - "invalid combination of [values_axes] on appending data " - "[name->values_block_1,cname->values_block_1," - "dtype->bytes24,kind->string,shape->(1, 30)] " - "vs current table " - "[name->values_block_1,cname->values_block_1," - "dtype->datetime64[s],kind->datetime64[s],shape->None]" + "Cannot serialize the column [foo] " + "because its data contents are not [string] " + "but [datetime64[s]] object dtype" ) with pytest.raises(ValueError, match=msg): store.append("df", df) @@ -989,3 +986,29 @@ def test_append_to_multiple_min_itemsize(setup_path): ) result = store.select_as_multiple(["index", "nums", "strs"]) tm.assert_frame_equal(result, expected, check_index_type=True) + + +def test_append_string_nan_rep(setup_path): + # GH 16300 + df = DataFrame({"A": "a", "B": "foo"}, index=np.arange(10)) + df_nan = df.copy() + df_nan.loc[0:4, :] = np.nan + msg = "NaN representation is too large for existing column size" + + with ensure_clean_store(setup_path) as store: + # string column too small + store.append("sa", df["A"]) + with pytest.raises(ValueError, match=msg): + store.append("sa", df_nan["A"]) + + # nan_rep too big + store.append("sb", df["B"], nan_rep="bars") + with pytest.raises(ValueError, match=msg): + store.append("sb", df_nan["B"]) + + # smaller modified nan_rep + store.append("sc", df["A"], nan_rep="n") + store.append("sc", df_nan["A"]) + result = store["sc"] + expected = concat([df["A"], df_nan["A"]]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py index 2397d18b1019e..72d90b1273d65 100644 --- 
a/pandas/tests/io/pytables/test_round_trip.py +++ b/pandas/tests/io/pytables/test_round_trip.py @@ -213,12 +213,9 @@ def test_table_values_dtypes_roundtrip(setup_path): # incompatible dtype msg = re.escape( - "invalid combination of [values_axes] on appending data " - "[name->values_block_0,cname->values_block_0," - "dtype->float64,kind->float,shape->(1, 3)] vs " - "current table [name->values_block_0," - "cname->values_block_0,dtype->int64,kind->integer," - "shape->None]" + "Cannot serialize the column [a] " + "because its data contents are not [float] " + "but [integer] object dtype" ) with pytest.raises(ValueError, match=msg): store.append("df_i8", df1) From 413563e614998262ff914c4e6dca65ebe32b048f Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Mon, 10 Feb 2025 09:23:52 -0500 Subject: [PATCH 2/5] TST(string dtype): Resolve xfails in pytables (#60795) (cherry picked from commit 4511251ccf409f2ba71cab0283bdf751697ee539) --- pandas/io/pytables.py | 3 + pandas/tests/io/pytables/test_append.py | 56 +++++++++------- pandas/tests/io/pytables/test_categorical.py | 6 +- pandas/tests/io/pytables/test_complex.py | 6 -- pandas/tests/io/pytables/test_errors.py | 18 ++--- .../tests/io/pytables/test_file_handling.py | 10 +-- pandas/tests/io/pytables/test_keys.py | 7 +- pandas/tests/io/pytables/test_put.py | 4 +- pandas/tests/io/pytables/test_read.py | 16 +++-- pandas/tests/io/pytables/test_round_trip.py | 49 ++++++++------ pandas/tests/io/pytables/test_select.py | 44 ++++++------ pandas/tests/io/pytables/test_store.py | 67 ++++++++++--------- pandas/tests/io/pytables/test_timezones.py | 6 -- 13 files changed, 143 insertions(+), 149 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 6d5202c58a0d5..d93a3f26934a0 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -5093,6 +5093,9 @@ def _maybe_convert_for_string_atom( errors, columns: list[str], ): + if isinstance(bvalues.dtype, 
StringDtype): + # "ndarray[Any, Any]" has no attribute "to_numpy" + bvalues = bvalues.to_numpy() # type: ignore[union-attr] if bvalues.dtype != object: return bvalues diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py index fd2deacb69b3c..39c203c558a5b 100644 --- a/pandas/tests/io/pytables/test_append.py +++ b/pandas/tests/io/pytables/test_append.py @@ -25,10 +25,7 @@ ensure_clean_store, ) -pytestmark = [ - pytest.mark.single_cpu, - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), -] +pytestmark = [pytest.mark.single_cpu] tables = pytest.importorskip("tables") @@ -40,7 +37,7 @@ def test_append(setup_path): # tables.NaturalNameWarning): df = DataFrame( np.random.default_rng(2).standard_normal((20, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=20, freq="B"), ) _maybe_remove(store, "df1") @@ -201,7 +198,7 @@ def test_append_some_nans(setup_path): tm.assert_frame_equal(store["df3"], df3, check_index_type=True) -def test_append_all_nans(setup_path): +def test_append_all_nans(setup_path, using_infer_string): with ensure_clean_store(setup_path) as store: df = DataFrame( { @@ -253,7 +250,13 @@ def test_append_all_nans(setup_path): _maybe_remove(store, "df") store.append("df", df[:10], dropna=True) store.append("df", df[10:], dropna=True) - tm.assert_frame_equal(store["df"], df, check_index_type=True) + result = store["df"] + expected = df + if using_infer_string: + # TODO: Test is incorrect when not using_infer_string. + # Should take the last 4 rows unconditionally. 
+ expected = expected[-4:] + tm.assert_frame_equal(result, expected, check_index_type=True) _maybe_remove(store, "df2") store.append("df2", df[:10], dropna=False) @@ -292,7 +295,7 @@ def test_append_frame_column_oriented(setup_path): # column oriented df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df.index = df.index._with_freq(None) # freq doesn't round-trip @@ -417,7 +420,7 @@ def check_col(key, name, size): { "A": [0.0, 1.0, 2.0, 3.0, 4.0], "B": [0.0, 1.0, 0.0, 1.0, 0.0], - "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"], dtype=object), + "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"]), "D": date_range("20130101", periods=5), } ).set_index("C") @@ -444,7 +447,7 @@ def check_col(key, name, size): _maybe_remove(store, "df") df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df["string"] = "foo" @@ -504,11 +507,12 @@ def test_append_with_empty_string(setup_path): tm.assert_frame_equal(store.select("df"), df) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_append_with_data_columns(setup_path): with ensure_clean_store(setup_path) as store: df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df.iloc[0, df.columns.get_loc("B")] = 1.0 @@ -684,8 +688,8 @@ def test_append_misc(setup_path): with ensure_clean_store(setup_path) as store: df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) store.append("df", df, 
chunksize=1) result = store.select("df") @@ -701,8 +705,8 @@ def test_append_misc_chunksize(setup_path, chunksize): # more chunksize in append tests df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) df["string"] = "foo" df["float322"] = 1.0 @@ -742,15 +746,15 @@ def test_append_misc_empty_frame(setup_path): # the conversion from AM->BM converts the invalid object dtype column into # a datetime64 column no longer raising an error @td.skip_array_manager_not_yet_implemented -def test_append_raise(setup_path): +def test_append_raise(setup_path, using_infer_string): with ensure_clean_store(setup_path) as store: # test append with invalid input to get good error messages # list in column df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) df["invalid"] = [["a"]] * len(df) assert df.dtypes["invalid"] == np.object_ @@ -770,8 +774,8 @@ def test_append_raise(setup_path): # datetime with embedded nans as object df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) s = Series(datetime.datetime(2001, 1, 2), index=df.index) s = s.astype(object) @@ -798,8 +802,8 @@ def test_append_raise(setup_path): # appending an incompatible table df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) store.append("df", df) @@ -876,7 +880,7 @@ def 
test_append_with_timedelta(setup_path): def test_append_to_multiple(setup_path): df1 = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df2 = df1.copy().rename(columns="{}_2".format) @@ -913,12 +917,12 @@ def test_append_to_multiple(setup_path): def test_append_to_multiple_dropna(setup_path): df1 = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df2 = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ).rename(columns="{}_2".format) df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan @@ -938,7 +942,7 @@ def test_append_to_multiple_dropna(setup_path): def test_append_to_multiple_dropna_false(setup_path): df1 = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df2 = df1.copy().rename(columns="{}_2".format) diff --git a/pandas/tests/io/pytables/test_categorical.py b/pandas/tests/io/pytables/test_categorical.py index 07c797467e5e2..a875e19ea7f0e 100644 --- a/pandas/tests/io/pytables/test_categorical.py +++ b/pandas/tests/io/pytables/test_categorical.py @@ -16,10 +16,7 @@ ensure_clean_store, ) -pytestmark = [ - pytest.mark.single_cpu, - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), -] +pytestmark = [pytest.mark.single_cpu] def test_categorical(setup_path): @@ -143,6 +140,7 @@ def test_categorical(setup_path): store.select("df3/meta/s/meta") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_categorical_conversion(tmp_path, 
setup_path): # GH13322 # Check that read_hdf with categorical columns doesn't return rows if diff --git a/pandas/tests/io/pytables/test_complex.py b/pandas/tests/io/pytables/test_complex.py index d140cfc941e16..c5cac5a5caf09 100644 --- a/pandas/tests/io/pytables/test_complex.py +++ b/pandas/tests/io/pytables/test_complex.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -13,10 +11,6 @@ from pandas.io.pytables import read_hdf -pytestmark = pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string)", strict=False -) - def test_complex_fixed(tmp_path, setup_path): df = DataFrame( diff --git a/pandas/tests/io/pytables/test_errors.py b/pandas/tests/io/pytables/test_errors.py index c31b9989ef35e..b28101c09820f 100644 --- a/pandas/tests/io/pytables/test_errors.py +++ b/pandas/tests/io/pytables/test_errors.py @@ -5,8 +5,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( CategoricalIndex, DataFrame, @@ -24,10 +22,7 @@ _maybe_adjust_name, ) -pytestmark = [ - pytest.mark.single_cpu, - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), -] +pytestmark = [pytest.mark.single_cpu] def test_pass_spec_to_storer(setup_path): @@ -93,9 +88,14 @@ def test_unimplemented_dtypes_table_columns(setup_path): with ensure_clean_store(setup_path) as store: # this fails because we have a date in the object block...... 
- msg = re.escape( - """Cannot serialize the column [datetime1] -because its data contents are not [string] but [date] object dtype""" + msg = "|".join( + [ + re.escape( + "Cannot serialize the column [datetime1]\nbecause its data " + "contents are not [string] but [date] object dtype" + ), + re.escape("[date] is not implemented as a table column"), + ] ) with pytest.raises(TypeError, match=msg): store.append("df_unimplemented", df) diff --git a/pandas/tests/io/pytables/test_file_handling.py b/pandas/tests/io/pytables/test_file_handling.py index 1878f2a392e13..100a55e6e346d 100644 --- a/pandas/tests/io/pytables/test_file_handling.py +++ b/pandas/tests/io/pytables/test_file_handling.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import ( PY311, is_ci_environment, @@ -34,9 +32,7 @@ from pandas.io import pytables from pandas.io.pytables import Term -pytestmark = [ - pytest.mark.single_cpu, -] +pytestmark = [pytest.mark.single_cpu] @pytest.mark.parametrize("mode", ["r", "r+", "a", "w"]) @@ -323,7 +319,6 @@ def test_complibs(tmp_path, lvl, lib, request): assert node.filters.complib == lib -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.skipif( not is_platform_little_endian(), reason="reason platform is not little endian" ) @@ -341,7 +336,6 @@ def test_encoding(setup_path): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "val", [ @@ -356,7 +350,7 @@ def test_encoding(setup_path): [b"A\xf8\xfc", np.nan, b"", b"b", b"c"], ], ) -@pytest.mark.parametrize("dtype", ["category", object]) +@pytest.mark.parametrize("dtype", ["category", None]) def test_latin_encoding(tmp_path, setup_path, dtype, val): enc = "latin-1" nan_rep = "" diff --git a/pandas/tests/io/pytables/test_keys.py b/pandas/tests/io/pytables/test_keys.py index 7d0802dcf2e47..9c5fc8786c7c6 
100644 --- a/pandas/tests/io/pytables/test_keys.py +++ b/pandas/tests/io/pytables/test_keys.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( DataFrame, HDFStore, @@ -15,10 +13,7 @@ tables, ) -pytestmark = [ - pytest.mark.single_cpu, - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), -] +pytestmark = [pytest.mark.single_cpu] def test_keys(setup_path): diff --git a/pandas/tests/io/pytables/test_put.py b/pandas/tests/io/pytables/test_put.py index 38f0379eb9a66..36ca68eb227a6 100644 --- a/pandas/tests/io/pytables/test_put.py +++ b/pandas/tests/io/pytables/test_put.py @@ -22,9 +22,7 @@ ) from pandas.util import _test_decorators as td -pytestmark = [ - pytest.mark.single_cpu, -] +pytestmark = [pytest.mark.single_cpu] def test_format_type(tmp_path, setup_path): diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py index 28cd8aea1defc..bfebf18c0e0ab 100644 --- a/pandas/tests/io/pytables/test_read.py +++ b/pandas/tests/io/pytables/test_read.py @@ -28,10 +28,7 @@ from pandas.io.pytables import TableIterator -pytestmark = [ - pytest.mark.single_cpu, - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), -] +pytestmark = [pytest.mark.single_cpu] def test_read_missing_key_close_store(tmp_path, setup_path): @@ -77,10 +74,11 @@ def test_read_missing_key_opened_store(tmp_path, setup_path): read_hdf(store, "k1") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_read_column(setup_path): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) @@ -221,7 +219,7 @@ def test_legacy_table_read_py2(datapath): tm.assert_frame_equal(expected, result) -def test_read_hdf_open_store(tmp_path, setup_path): +def test_read_hdf_open_store(tmp_path, 
setup_path, using_infer_string): # GH10330 # No check for non-string path_or-buf, and no test of open store df = DataFrame( @@ -233,6 +231,12 @@ def test_read_hdf_open_store(tmp_path, setup_path): df = df.set_index(keys="E", append=True) path = tmp_path / setup_path + if using_infer_string: + # TODO(infer_string) make this work for string dtype + msg = "Saving a MultiIndex with an extension dtype is not supported." + with pytest.raises(NotImplementedError, match=msg): + df.to_hdf(path, key="df", mode="w") + return df.to_hdf(path, key="df", mode="w") direct = read_hdf(path, "df") with HDFStore(path, mode="r") as store: diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py index 72d90b1273d65..040708c9cedd0 100644 --- a/pandas/tests/io/pytables/test_round_trip.py +++ b/pandas/tests/io/pytables/test_round_trip.py @@ -4,8 +4,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.tslibs import Timestamp from pandas.compat import is_platform_windows @@ -26,10 +24,7 @@ ) from pandas.util import _test_decorators as td -pytestmark = [ - pytest.mark.single_cpu, - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), -] +pytestmark = [pytest.mark.single_cpu] def test_conv_read_write(): @@ -49,8 +44,8 @@ def roundtrip(key, obj, **kwargs): o = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) tm.assert_frame_equal(o, roundtrip("frame", o)) @@ -150,8 +145,8 @@ def test_api_invalid(tmp_path, setup_path): # Invalid. 
df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) msg = "Can only append to Tables" @@ -201,7 +196,7 @@ def test_put_integer(setup_path): _check_roundtrip(df, tm.assert_frame_equal, setup_path) -def test_table_values_dtypes_roundtrip(setup_path): +def test_table_values_dtypes_roundtrip(setup_path, using_infer_string): with ensure_clean_store(setup_path) as store: df1 = DataFrame({"a": [1, 2, 3]}, dtype="f8") store.append("df_f8", df1) @@ -244,6 +239,7 @@ def test_table_values_dtypes_roundtrip(setup_path): store.append("df_mixed_dtypes1", df1) result = store.select("df_mixed_dtypes1").dtypes.value_counts() result.index = [str(i) for i in result.index] + str_dtype = "str" if using_infer_string else "object" expected = Series( { "float32": 2, @@ -253,7 +249,7 @@ def test_table_values_dtypes_roundtrip(setup_path): "int16": 1, "int8": 1, "int64": 1, - "object": 1, + str_dtype: 1, "datetime64[ns]": 2, }, name="count", @@ -273,10 +269,10 @@ def test_series(setup_path): ) _check_roundtrip(ts, tm.assert_series_equal, path=setup_path) - ts2 = Series(ts.index, Index(ts.index, dtype=object)) + ts2 = Series(ts.index, Index(ts.index)) _check_roundtrip(ts2, tm.assert_series_equal, path=setup_path) - ts3 = Series(ts.values, Index(np.asarray(ts.index, dtype=object), dtype=object)) + ts3 = Series(ts.values, Index(np.asarray(ts.index))) _check_roundtrip( ts3, tm.assert_series_equal, path=setup_path, check_index_type=False ) @@ -366,8 +362,8 @@ def test_timeseries_preepoch(setup_path, request): def test_frame(compression, setup_path): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) # put in some random NAs @@ 
-383,7 +379,7 @@ def test_frame(compression, setup_path): tdf = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) _check_roundtrip( @@ -398,7 +394,10 @@ def test_frame(compression, setup_path): assert recons._mgr.is_consolidated() # empty - _check_roundtrip(df[:0], tm.assert_frame_equal, path=setup_path) + df2 = df[:0] + # Prevent df2 from having index with inferred_type as string + df2.index = Index([]) + _check_roundtrip(df2[:0], tm.assert_frame_equal, path=setup_path) def test_empty_series_frame(setup_path): @@ -430,9 +429,17 @@ def test_can_serialize_dates(setup_path): _check_roundtrip(frame, tm.assert_frame_equal, path=setup_path) -def test_store_hierarchical(setup_path, multiindex_dataframe_random_data): +def test_store_hierarchical( + setup_path, using_infer_string, multiindex_dataframe_random_data +): frame = multiindex_dataframe_random_data + if using_infer_string: + # TODO(infer_string) make this work for string dtype + msg = "Saving a MultiIndex with an extension dtype is not supported." 
+ with pytest.raises(NotImplementedError, match=msg): + _check_roundtrip(frame, tm.assert_frame_equal, path=setup_path) + return _check_roundtrip(frame, tm.assert_frame_equal, path=setup_path) _check_roundtrip(frame.T, tm.assert_frame_equal, path=setup_path) _check_roundtrip(frame["A"], tm.assert_series_equal, path=setup_path) @@ -451,8 +458,8 @@ def test_store_mixed(compression, setup_path): def _make_one(): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) df["obj1"] = "foo" df["obj2"] = "bar" diff --git a/pandas/tests/io/pytables/test_select.py b/pandas/tests/io/pytables/test_select.py index 9f403f8293aed..f781b6756fec9 100644 --- a/pandas/tests/io/pytables/test_select.py +++ b/pandas/tests/io/pytables/test_select.py @@ -26,10 +26,7 @@ from pandas.io.pytables import Term -pytestmark = [ - pytest.mark.single_cpu, - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), -] +pytestmark = [pytest.mark.single_cpu] def test_select_columns_in_where(setup_path): @@ -137,7 +134,7 @@ def test_select(setup_path): # select with columns= df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) _maybe_remove(store, "df") @@ -277,8 +274,8 @@ def test_select_dtypes(setup_path): with ensure_clean_store(setup_path) as store: df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) expected = df[df["A"] > 0] @@ -342,7 +339,7 @@ def test_select_iterator(tmp_path, setup_path): with ensure_clean_store(setup_path) as store: df = DataFrame( 
np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) _maybe_remove(store, "df") @@ -367,7 +364,7 @@ def test_select_iterator(tmp_path, setup_path): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df.to_hdf(path, key="df_non_table") @@ -383,7 +380,7 @@ def test_select_iterator(tmp_path, setup_path): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df.to_hdf(path, key="df", format="table") @@ -400,7 +397,7 @@ def test_select_iterator(tmp_path, setup_path): with ensure_clean_store(setup_path) as store: df1 = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) store.append("df1", df1, data_columns=True) @@ -428,7 +425,7 @@ def test_select_iterator_complete_8014(setup_path): with ensure_clean_store(setup_path) as store: expected = DataFrame( np.random.default_rng(2).standard_normal((100064, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=100064, freq="s"), ) _maybe_remove(store, "df") @@ -463,7 +460,7 @@ def test_select_iterator_complete_8014(setup_path): with ensure_clean_store(setup_path) as store: expected = DataFrame( np.random.default_rng(2).standard_normal((100064, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=100064, freq="s"), ) _maybe_remove(store, "df") @@ -505,7 +502,7 @@ def test_select_iterator_non_complete_8014(setup_path): with 
ensure_clean_store(setup_path) as store: expected = DataFrame( np.random.default_rng(2).standard_normal((100064, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=100064, freq="s"), ) _maybe_remove(store, "df") @@ -539,7 +536,7 @@ def test_select_iterator_non_complete_8014(setup_path): with ensure_clean_store(setup_path) as store: expected = DataFrame( np.random.default_rng(2).standard_normal((100064, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=100064, freq="s"), ) _maybe_remove(store, "df") @@ -563,7 +560,7 @@ def test_select_iterator_many_empty_frames(setup_path): with ensure_clean_store(setup_path) as store: expected = DataFrame( np.random.default_rng(2).standard_normal((100064, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=100064, freq="s"), ) _maybe_remove(store, "df") @@ -615,7 +612,7 @@ def test_select_iterator_many_empty_frames(setup_path): def test_frame_select(setup_path): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) @@ -640,7 +637,7 @@ def test_frame_select(setup_path): # invalid terms df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) store.append("df_time", df) @@ -654,12 +651,13 @@ def test_frame_select(setup_path): # store.select('frame', [crit1, crit2]) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_frame_select_complex(setup_path): # select via complex criteria df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), 
index=date_range("2000-01-01", periods=10, freq="B"), ) df["string"] = "foo" @@ -776,7 +774,7 @@ def test_invalid_filtering(setup_path): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) @@ -798,7 +796,7 @@ def test_string_select(setup_path): with ensure_clean_store(setup_path) as store: df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) @@ -842,7 +840,7 @@ def test_string_select(setup_path): def test_select_as_multiple(setup_path): df1 = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df2 = df1.copy().rename(columns="{}_2".format) @@ -967,6 +965,7 @@ def test_query_long_float_literal(setup_path): tm.assert_frame_equal(expected, result) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_query_compare_column_type(setup_path): # GH 15492 df = DataFrame( @@ -1043,7 +1042,6 @@ def test_select_large_integer(tmp_path): ), columns=["x", "y"], ) - result = None with HDFStore(path) as s: s.append("data", df, data_columns=True, index=False) result = s.select("data", where="y==-9223372036854775801").get("y").get(0) diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 8a33cccf62fcf..c349d2143ad11 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -9,6 +9,8 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW + import pandas as pd from pandas import ( DataFrame, @@ -33,10 +35,7 @@ read_hdf, ) -pytestmark = [ - pytest.mark.single_cpu, - pytest.mark.xfail(using_string_dtype(), 
reason="TODO(infer_string)", strict=False), -] +pytestmark = [pytest.mark.single_cpu] tables = pytest.importorskip("tables") @@ -108,7 +107,7 @@ def test_iter_empty(setup_path): assert list(store) == [] -def test_repr(setup_path): +def test_repr(setup_path, using_infer_string): with ensure_clean_store(setup_path) as store: repr(store) store.info() @@ -143,7 +142,9 @@ def test_repr(setup_path): df.loc[df.index[3:6], ["obj1"]] = np.nan df = df._consolidate() - with tm.assert_produces_warning(pd.errors.PerformanceWarning): + warning = None if using_infer_string else pd.errors.PerformanceWarning + msg = "cannot\nmap directly to c-types .* dtype='object'" + with tm.assert_produces_warning(warning, match=msg): store["df"] = df # make a random group in hdf space @@ -314,7 +315,7 @@ def test_getattr(setup_path): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) store["df"] = df @@ -381,7 +382,7 @@ def test_to_hdf_with_min_itemsize(tmp_path, setup_path): { "A": [0.0, 1.0, 2.0, 3.0, 4.0], "B": [0.0, 1.0, 0.0, 1.0, 0.0], - "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"], dtype=object), + "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"]), "D": date_range("20130101", periods=5), } ).set_index("C") @@ -397,6 +398,10 @@ def test_to_hdf_with_min_itemsize(tmp_path, setup_path): tm.assert_series_equal(read_hdf(path, "ss4"), concat([df["B"], df2["B"]])) +@pytest.mark.xfail( + using_string_dtype() and HAS_PYARROW, + reason="TODO(infer_string): can't encode '\ud800': surrogates not allowed", +) @pytest.mark.parametrize("format", ["fixed", "table"]) def test_to_hdf_errors(tmp_path, format, setup_path): data = ["\ud800foo"] @@ -418,7 +423,7 @@ def col(t, column): # data columns df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), 
index=date_range("2000-01-01", periods=10, freq="B"), ) df["string"] = "foo" @@ -453,7 +458,7 @@ def col(t, column): # data columns df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df["string"] = "foo" @@ -495,8 +500,8 @@ def test_table_mixed_dtypes(setup_path): # frame df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) df["obj1"] = "foo" df["obj2"] = "bar" @@ -551,8 +556,8 @@ def test_remove(setup_path): ) df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) store["a"] = ts store["b"] = df @@ -615,8 +620,8 @@ def test_same_name_scoping(setup_path): def test_store_index_name(setup_path): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) df.index.name = "foo" @@ -658,8 +663,8 @@ def test_store_index_name_numpy_str(tmp_path, table_format, setup_path, unit, tz def test_store_series_name(setup_path): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) series = df["A"] @@ -673,7 +678,7 @@ def test_overwrite_node(setup_path): with ensure_clean_store(setup_path) as store: store["a"] = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), 
+ columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) ts = Series( @@ -687,7 +692,7 @@ def test_overwrite_node(setup_path): def test_coordinates(setup_path): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) @@ -722,7 +727,7 @@ def test_coordinates(setup_path): _maybe_remove(store, "df2") df1 = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df2 = df1.copy().rename(columns="{}_2".format) @@ -878,8 +883,8 @@ def test_start_stop_fixed(setup_path): # sparse; not implemented df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) df.iloc[3:5, 1:3] = np.nan df.iloc[8:10, -2] = np.nan @@ -905,8 +910,8 @@ def test_select_filter_corner(setup_path): def test_path_pathlib(): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) result = tm.round_trip_pathlib( @@ -935,8 +940,8 @@ def test_contiguous_mixed_data_table(start, stop, setup_path): def test_path_pathlib_hdfstore(): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) def writer(path): @@ -954,8 +959,8 @@ def reader(path): def test_pickle_path_localpath(): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), 
dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) result = tm.round_trip_pathlib( lambda p: df.to_hdf(p, key="df"), lambda p: read_hdf(p, "df") @@ -986,8 +991,8 @@ def reader(path): def test_copy(propindexes): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) with tm.ensure_clean() as path: diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py index 05d630dc0e47c..c5613daf62207 100644 --- a/pandas/tests/io/pytables/test_timezones.py +++ b/pandas/tests/io/pytables/test_timezones.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.tslibs.timezones import maybe_get_tz import pandas.util._test_decorators as td @@ -25,10 +23,6 @@ ensure_clean_store, ) -pytestmark = pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string)", strict=False -) - def _compare_with_tz(a, b): tm.assert_frame_equal(a, b) From e0f47b77557bc990211c492a72405ecc2b0ff5f2 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Mon, 17 Feb 2025 20:31:00 -0500 Subject: [PATCH 3/5] BUG(string dtype): Resolve pytables xfail when reading with condition (#60943) (cherry picked from commit 0ec5f2668e9568d90595180d5ee925305ec7182e) --- pandas/io/pytables.py | 18 ++++++++++++++++-- pandas/tests/io/pytables/test_append.py | 3 --- pandas/tests/io/pytables/test_categorical.py | 3 --- pandas/tests/io/pytables/test_read.py | 3 --- pandas/tests/io/pytables/test_select.py | 4 ---- 5 files changed, 16 insertions(+), 15 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index d93a3f26934a0..65f95dab7b42f 100644 --- a/pandas/io/pytables.py +++ 
b/pandas/io/pytables.py @@ -4093,6 +4093,8 @@ def _create_axes( ordered = data_converted.ordered meta = "category" metadata = np.asarray(data_converted.categories).ravel() + elif isinstance(blk.dtype, StringDtype): + meta = str(blk.dtype) data, dtype_name = _get_data_and_dtype_name(data_converted) @@ -4360,7 +4362,9 @@ def read_column( encoding=self.encoding, errors=self.errors, ) - return Series(_set_tz(col_values[1], a.tz), name=column, copy=False) + cvs = _set_tz(col_values[1], a.tz) + dtype = getattr(self.table.attrs, f"{column}_meta", None) + return Series(cvs, name=column, copy=False, dtype=dtype) raise KeyError(f"column [{column}] not found in the table") @@ -4708,8 +4712,18 @@ def read( df = DataFrame._from_arrays([values], columns=cols_, index=index_) if not (using_string_dtype() and values.dtype.kind == "O"): assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype) + + # If str / string dtype is stored in meta, use that. + converted = False + for column in cols_: + dtype = getattr(self.table.attrs, f"{column}_meta", None) + if dtype in ["str", "string"]: + df[column] = df[column].astype(dtype) + converted = True + # Otherwise try inference. 
if ( - using_string_dtype() + not converted + and using_string_dtype() and isinstance(values, np.ndarray) and is_string_array( values, diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py index 39c203c558a5b..d0246c8f58d6a 100644 --- a/pandas/tests/io/pytables/test_append.py +++ b/pandas/tests/io/pytables/test_append.py @@ -5,8 +5,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.tslibs import Timestamp import pandas.util._test_decorators as td @@ -507,7 +505,6 @@ def test_append_with_empty_string(setup_path): tm.assert_frame_equal(store.select("df"), df) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_append_with_data_columns(setup_path): with ensure_clean_store(setup_path) as store: df = DataFrame( diff --git a/pandas/tests/io/pytables/test_categorical.py b/pandas/tests/io/pytables/test_categorical.py index a875e19ea7f0e..449bc5cf1fc57 100644 --- a/pandas/tests/io/pytables/test_categorical.py +++ b/pandas/tests/io/pytables/test_categorical.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( Categorical, DataFrame, @@ -140,7 +138,6 @@ def test_categorical(setup_path): store.select("df3/meta/s/meta") -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_categorical_conversion(tmp_path, setup_path): # GH13322 # Check that read_hdf with categorical columns doesn't return rows if diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py index bfebf18c0e0ab..5bec673ad3c70 100644 --- a/pandas/tests/io/pytables/test_read.py +++ b/pandas/tests/io/pytables/test_read.py @@ -5,8 +5,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.tslibs import Timestamp from pandas.compat import is_platform_windows @@ -74,7 +72,6 @@ def test_read_missing_key_opened_store(tmp_path, 
setup_path): read_hdf(store, "k1") -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_read_column(setup_path): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), diff --git a/pandas/tests/io/pytables/test_select.py b/pandas/tests/io/pytables/test_select.py index f781b6756fec9..e76934745f004 100644 --- a/pandas/tests/io/pytables/test_select.py +++ b/pandas/tests/io/pytables/test_select.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.tslibs import Timestamp import pandas as pd @@ -651,7 +649,6 @@ def test_frame_select(setup_path): # store.select('frame', [crit1, crit2]) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_frame_select_complex(setup_path): # select via complex criteria @@ -965,7 +962,6 @@ def test_query_long_float_literal(setup_path): tm.assert_frame_equal(expected, result) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_query_compare_column_type(setup_path): # GH 15492 df = DataFrame( From 3d5c84b5f9a4b5986bae5e5fd136de2f27b23be6 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Tue, 18 Feb 2025 12:39:50 -0500 Subject: [PATCH 4/5] Backport PR #60940: ENH: Add dtype argument to str.decode --- doc/source/whatsnew/v2.3.0.rst | 1 + pandas/core/strings/accessor.py | 18 ++++++++++++++++-- pandas/tests/strings/test_strings.py | 24 ++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 0b7c2bac1be6a..c4e01a86ce843 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -37,6 +37,7 @@ Other enhancements updated to raise FutureWarning with NumPy >= 2 (:issue:`60340`) - :meth:`Series.str.decode` result now has ``StringDtype`` when ``future.infer_string`` is True (:issue:`60709`) - :meth:`~Series.to_hdf` and 
:meth:`~DataFrame.to_hdf` now round-trip with ``StringDtype`` (:issue:`60663`) +- The :meth:`Series.str.decode` has gained the argument ``dtype`` to control the dtype of the result (:issue:`60940`) - The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns when backed by PyArrow (:issue:`60633`) - The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 3c4cf60ab262a..c0e458f7968e7 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -33,6 +33,7 @@ is_list_like, is_object_dtype, is_re, + is_string_dtype, ) from pandas.core.dtypes.dtypes import ( ArrowDtype, @@ -1981,7 +1982,9 @@ def slice_replace(self, start=None, stop=None, repl=None): result = self._data.array._str_slice_replace(start, stop, repl) return self._wrap_result(result) - def decode(self, encoding, errors: str = "strict"): + def decode( + self, encoding, errors: str = "strict", dtype: str | DtypeObj | None = None + ): """ Decode character string in the Series/Index using indicated encoding. @@ -1992,6 +1995,14 @@ def decode(self, encoding, errors: str = "strict"): ---------- encoding : str errors : str, optional + Specifies the error handling scheme. + Possible values are those supported by :meth:`bytes.decode`. + dtype : str or dtype, optional + The dtype of the result. When not ``None``, must be either a string or + object dtype. When ``None``, the dtype of the result is determined by + ``pd.options.future.infer_string``. + + .. 
versionadded:: 2.3.0 Returns ------- @@ -2008,6 +2019,10 @@ def decode(self, encoding, errors: str = "strict"): 2 () dtype: object """ + if dtype is not None and not is_string_dtype(dtype): + raise ValueError(f"dtype must be string or object, got {dtype=}") + if dtype is None and get_option("future.infer_string"): + dtype = "str" # TODO: Add a similar _bytes interface. if encoding in _cpython_optimized_decoders: # CPython optimized implementation @@ -2017,7 +2032,6 @@ def decode(self, encoding, errors: str = "strict"): f = lambda x: decoder(x, errors)[0] arr = self._data.array result = arr._str_map(f) - dtype = "str" if get_option("future.infer_string") else None return self._wrap_result(result, dtype=dtype) @forbid_nonstring_types(["bytes"]) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 59a06a421f53e..c729b910d05a7 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -599,6 +599,30 @@ def test_decode_errors_kwarg(): tm.assert_series_equal(result, expected) +def test_decode_string_dtype(string_dtype): + # https://github.com/pandas-dev/pandas/pull/60940 + ser = Series([b"a", b"b"]) + result = ser.str.decode("utf-8", dtype=string_dtype) + expected = Series(["a", "b"], dtype=string_dtype) + tm.assert_series_equal(result, expected) + + +def test_decode_object_dtype(object_dtype): + # https://github.com/pandas-dev/pandas/pull/60940 + ser = Series([b"a", rb"\ud800"]) + result = ser.str.decode("utf-8", dtype=object_dtype) + expected = Series(["a", r"\ud800"], dtype=object_dtype) + tm.assert_series_equal(result, expected) + + +def test_decode_bad_dtype(): + # https://github.com/pandas-dev/pandas/pull/60940 + ser = Series([b"a", b"b"]) + msg = "dtype must be string or object, got dtype='int64'" + with pytest.raises(ValueError, match=msg): + ser.str.decode("utf-8", dtype="int64") + + @pytest.mark.parametrize( "form, expected", [ From 5e5db791fe78466d98d1da76d0ee5b9afbb61c21 Mon Sep 17 
00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Tue, 18 Feb 2025 21:35:41 -0500 Subject: [PATCH 5/5] Backport PR #60938: ENH(string dtype): Implement cumsum for Python-backed strings --- doc/source/whatsnew/v2.3.0.rst | 2 +- pandas/core/arrays/string_.py | 83 ++++++++++++++++++++++++++ pandas/tests/apply/test_str.py | 15 +---- pandas/tests/extension/test_string.py | 6 +- pandas/tests/series/test_cumulative.py | 11 ++-- 5 files changed, 92 insertions(+), 25 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index c4e01a86ce843..db3dcb50bacd0 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -38,7 +38,7 @@ Other enhancements - :meth:`Series.str.decode` result now has ``StringDtype`` when ``future.infer_string`` is True (:issue:`60709`) - :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with ``StringDtype`` (:issue:`60663`) - The :meth:`Series.str.decode` has gained the argument ``dtype`` to control the dtype of the result (:issue:`60940`) -- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns when backed by PyArrow (:issue:`60633`) +- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns (:issue:`60633`) - The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 3efb48c86e92c..c1048e806ff9a 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -46,6 +46,7 @@ ) from pandas.core import ( + missing, nanops, ops, ) @@ -865,6 +866,88 @@ def _reduce( return result raise TypeError(f"Cannot perform reduction '{name}' with string dtype") + def _accumulate(self, name: str, *, skipna: bool = True, **kwargs) -> StringArray: + """ + Return an ExtensionArray performing an accumulation operation. + + The underlying data type might change. + + Parameters + ---------- + name : str + Name of the function, supported values are: + - cummin + - cummax + - cumsum + - cumprod + skipna : bool, default True + If True, skip NA values. + **kwargs + Additional keyword arguments passed to the accumulation function. + Currently, there is no supported kwarg. + + Returns + ------- + array + + Raises + ------ + NotImplementedError : subclass does not define accumulations + """ + if name == "cumprod": + msg = f"operation '{name}' not supported for dtype '{self.dtype}'" + raise TypeError(msg) + + # We may need to strip out trailing NA values + tail: np.ndarray | None = None + na_mask: np.ndarray | None = None + ndarray = self._ndarray + np_func = { + "cumsum": np.cumsum, + "cummin": np.minimum.accumulate, + "cummax": np.maximum.accumulate, + }[name] + + if self._hasna: + na_mask = cast("npt.NDArray[np.bool_]", isna(ndarray)) + if np.all(na_mask): + return type(self)(ndarray) + if skipna: + if name == "cumsum": + ndarray = np.where(na_mask, "", ndarray) + else: + # We can retain the running min/max by forward/backward filling. 
+ ndarray = ndarray.copy() + missing.pad_or_backfill_inplace( + ndarray, + method="pad", + axis=0, + ) + missing.pad_or_backfill_inplace( + ndarray, + method="backfill", + axis=0, + ) + else: + # When not skipping NA values, the result should be null from + # the first NA value onward. + idx = np.argmax(na_mask) + tail = np.empty(len(ndarray) - idx, dtype="object") + tail[:] = self.dtype.na_value + ndarray = ndarray[:idx] + + # mypy: Cannot call function of unknown type + np_result = np_func(ndarray) # type: ignore[operator] + + if tail is not None: + np_result = np.hstack((np_result, tail)) + elif na_mask is not None: + # Argument 2 to "where" has incompatible type "NAType | float" + np_result = np.where(na_mask, self.dtype.na_value, np_result) # type: ignore[arg-type] + + result = type(self)(np_result) + return result + def _wrap_reduction_result(self, axis: AxisInt | None, result) -> Any: if self.dtype.na_value is np.nan and result is libmissing.NA: # the masked_reductions use pd.NA -> convert to np.nan diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py index 9c7836a0aa167..17e8322dc40e1 100644 --- a/pandas/tests/apply/test_str.py +++ b/pandas/tests/apply/test_str.py @@ -4,8 +4,6 @@ import numpy as np import pytest -from pandas.compat import HAS_PYARROW - from pandas.core.dtypes.common import is_number from pandas import ( @@ -168,21 +166,10 @@ def test_agg_cython_table_series(series, func, expected): ), ), ) -def test_agg_cython_table_transform_series(request, series, func, expected): +def test_agg_cython_table_transform_series(series, func, expected): # GH21224 # test transforming functions in # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum) - if ( - series.dtype == "string" - and func in ("cumsum", np.cumsum, np.nancumsum) - and not HAS_PYARROW - ): - request.applymarker( - pytest.mark.xfail( - raises=NotImplementedError, - reason="TODO(infer_string) cumsum not yet implemented for string", - ) - ) warn = None if 
isinstance(func, str) else FutureWarning with tm.assert_produces_warning(warn, match="is currently using Series.*"): result = series.agg(func) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 301c7ee851aa0..526cf426781ad 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -200,11 +200,7 @@ def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: assert isinstance(ser.dtype, StorageExtensionDtype) - return ser.dtype.storage == "pyarrow" and op_name in [ - "cummin", - "cummax", - "cumsum", - ] + return op_name in ["cummin", "cummax", "cumsum"] def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): dtype = cast(StringDtype, tm.get_dtype(obj)) diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py index 0dc391db2182b..97f5fb4a9f96f 100644 --- a/pandas/tests/series/test_cumulative.py +++ b/pandas/tests/series/test_cumulative.py @@ -193,13 +193,14 @@ def test_cumprod_timedelta(self): ([pd.NA, pd.NA, pd.NA], "cummax", False, [pd.NA, pd.NA, pd.NA]), ], ) - def test_cum_methods_pyarrow_strings( - self, pyarrow_string_dtype, data, op, skipna, expected_data + def test_cum_methods_ea_strings( + self, string_dtype_no_object, data, op, skipna, expected_data ): - # https://github.com/pandas-dev/pandas/pull/60633 - ser = pd.Series(data, dtype=pyarrow_string_dtype) + # https://github.com/pandas-dev/pandas/pull/60633 - pyarrow + # https://github.com/pandas-dev/pandas/pull/60938 - Python + ser = pd.Series(data, dtype=string_dtype_no_object) method = getattr(ser, op) - expected = pd.Series(expected_data, dtype=pyarrow_string_dtype) + expected = pd.Series(expected_data, dtype=string_dtype_no_object) result = method(skipna=skipna) tm.assert_series_equal(result, expected)