diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index 79dbe448e9cbe..7569a74752bf2 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -56,7 +56,7 @@ def df_table(): class TestBuildSchema: - def test_build_table_schema(self, df_schema): + def test_build_table_schema(self, df_schema, using_infer_string): result = build_table_schema(df_schema, version=False) expected = { "fields": [ @@ -68,6 +68,8 @@ def test_build_table_schema(self, df_schema): ], "primaryKey": ["idx"], } + if using_infer_string: + expected["fields"][2] = {"name": "B", "type": "any", "extDtype": "string"} assert result == expected result = build_table_schema(df_schema) assert "pandas_version" in result @@ -97,7 +99,7 @@ def test_series_unnamed(self): } assert result == expected - def test_multiindex(self, df_schema): + def test_multiindex(self, df_schema, using_infer_string): df = df_schema idx = pd.MultiIndex.from_product([("a", "b"), (1, 2)]) df.index = idx @@ -114,6 +116,13 @@ def test_multiindex(self, df_schema): ], "primaryKey": ["level_0", "level_1"], } + if using_infer_string: + expected["fields"][0] = { + "name": "level_0", + "type": "any", + "extDtype": "string", + } + expected["fields"][3] = {"name": "B", "type": "any", "extDtype": "string"} assert result == expected df.index.names = ["idx0", None] @@ -156,7 +165,10 @@ def test_as_json_table_type_bool_data(self, bool_type): def test_as_json_table_type_date_data(self, date_data): assert as_json_table_type(date_data.dtype) == "datetime" - @pytest.mark.parametrize("str_data", [pd.Series(["a", "b"]), pd.Index(["a", "b"])]) + @pytest.mark.parametrize( + "str_data", + [pd.Series(["a", "b"], dtype=object), pd.Index(["a", "b"], dtype=object)], + ) def test_as_json_table_type_string_data(self, str_data): assert as_json_table_type(str_data.dtype) == "string" @@ -261,7 +273,7 @@ def test_read_json_from_to_json_results(self): tm.assert_frame_equal(result1, df) tm.assert_frame_equal(result2, df) - def test_to_json(self, df_table): + def test_to_json(self, df_table, using_infer_string): df = df_table df.index.name = "idx" result = df.to_json(orient="table", date_format="iso") @@ -292,6 +304,9 @@ def test_to_json(self, df_table): {"name": "H", "type": "datetime", "tz": "US/Central"}, ] + if using_infer_string: + fields[2] = {"name": "B", "type": "any", "extDtype": "string"} + schema = {"fields": fields, "primaryKey": ["idx"]} data = [ OrderedDict( diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 5275050391ca3..052356fdc96ed 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -13,6 +13,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas.compat import IS64 import pandas.util._test_decorators as td @@ -30,6 +32,7 @@ ArrowStringArray, StringArray, ) +from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics from pandas.io.json import ujson_dumps @@ -238,7 +241,7 @@ def test_roundtrip_str_axes(self, orient, convert_axes, dtype): @pytest.mark.parametrize("convert_axes", [True, False]) def test_roundtrip_categorical( - self, request, orient, categorical_frame, convert_axes + self, request, orient, categorical_frame, convert_axes, using_infer_string ): # TODO: create a better frame to test with and improve coverage if orient in ("index", "columns"): @@ -252,7 +255,9 @@ def test_roundtrip_categorical( result = read_json(data, orient=orient, convert_axes=convert_axes) expected = categorical_frame.copy() - expected.index = expected.index.astype(str) # Categorical not preserved + expected.index = expected.index.astype( + str if not using_infer_string else "string[pyarrow_numpy]" + ) # Categorical not preserved expected.index.name = None # index names aren't preserved in JSON assert_json_roundtrip_equal(result, expected, orient) @@ -518,9 +523,9 @@ def test_v12_compat(self, datapath): df_iso = df.drop(["modified"], axis=1) v12_iso_json = os.path.join(dirpath, "tsframe_iso_v012.json") df_unser_iso = read_json(v12_iso_json) - tm.assert_frame_equal(df_iso, df_unser_iso) + tm.assert_frame_equal(df_iso, df_unser_iso, check_column_type=False) - def test_blocks_compat_GH9037(self): + def test_blocks_compat_GH9037(self, using_infer_string): index = pd.date_range("20000101", periods=10, freq="h") # freq doesn't round-trip index = DatetimeIndex(list(index), freq=None) @@ -604,7 +609,9 @@ def test_blocks_compat_GH9037(self): ) # JSON deserialisation always creates unicode strings - df_mixed.columns = df_mixed.columns.astype(np.str_) + df_mixed.columns = df_mixed.columns.astype( + np.str_ if not using_infer_string else "string[pyarrow_numpy]" + ) data = StringIO(df_mixed.to_json(orient="split")) df_roundtrip = read_json(data, orient="split") tm.assert_frame_equal( @@ -676,16 +683,19 @@ def test_series_non_unique_index(self): unserialized = read_json( StringIO(s.to_json(orient="records")), orient="records", typ="series" ) - tm.assert_numpy_array_equal(s.values, unserialized.values) + tm.assert_equal(s.values, unserialized.values) def test_series_default_orient(self, string_series): assert string_series.to_json() == string_series.to_json(orient="index") - def test_series_roundtrip_simple(self, orient, string_series): + def test_series_roundtrip_simple(self, orient, string_series, using_infer_string): data = StringIO(string_series.to_json(orient=orient)) result = read_json(data, typ="series", orient=orient) expected = string_series + if using_infer_string and orient in ("split", "index", "columns"): + # These schemas don't contain dtypes, so we infer string + expected.index = expected.index.astype("string[pyarrow_numpy]") if orient in ("values", "records"): expected = expected.reset_index(drop=True) if orient != "split": @@ -1459,6 +1469,9 @@ def test_from_json_to_json_table_dtypes(self): result = read_json(StringIO(dfjson), orient="table") tm.assert_frame_equal(result, expected) + # TODO: We are casting to string which coerces None to NaN before casting back + # to object, ending up with incorrect na values + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="incorrect na conversion") @pytest.mark.parametrize("orient", ["split", "records", "index", "columns"]) def test_to_json_from_json_columns_dtypes(self, orient): # GH21892 GH33205 @@ -1716,6 +1729,11 @@ def test_to_json_indent(self, indent): assert result == expected + @pytest.mark.skipif( + using_pyarrow_string_dtype(), + reason="Adjust expected when infer_string is default, no bug here, " + "just a complicated parametrization", + ) @pytest.mark.parametrize( "orient,expected", [ @@ -1991,7 +2009,9 @@ def test_json_uint64(self): @pytest.mark.parametrize( "orient", ["split", "records", "values", "index", "columns"] ) - def test_read_json_dtype_backend(self, string_storage, dtype_backend, orient): + def test_read_json_dtype_backend( + self, string_storage, dtype_backend, orient, using_infer_string + ): # GH#50750 pa = pytest.importorskip("pyarrow") df = DataFrame( @@ -2007,7 +2027,10 @@ def test_read_json_dtype_backend(self, string_storage, dtype_backend, orient): } ) - if string_storage == "python": + if using_infer_string: + string_array = ArrowStringArrayNumpySemantics(pa.array(["a", "b", "c"])) + string_array_na = ArrowStringArrayNumpySemantics(pa.array(["a", "b", None])) + elif string_storage == "python": string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_))