@@ -13,6 +13,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_pyarrow_string_dtype
+
 from pandas.compat import IS64
 import pandas.util._test_decorators as td
 
@@ -30,6 +32,7 @@
     ArrowStringArray,
     StringArray,
 )
+from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics
 
 from pandas.io.json import ujson_dumps
 
@@ -237,7 +240,7 @@ def test_roundtrip_str_axes(self, orient, convert_axes, dtype):
 
     @pytest.mark.parametrize("convert_axes", [True, False])
     def test_roundtrip_categorical(
-        self, request, orient, categorical_frame, convert_axes
+        self, request, orient, categorical_frame, convert_axes, using_infer_string
     ):
         # TODO: create a better frame to test with and improve coverage
         if orient in ("index", "columns"):
@@ -251,7 +254,9 @@ def test_roundtrip_categorical(
         result = read_json(data, orient=orient, convert_axes=convert_axes)
 
         expected = categorical_frame.copy()
-        expected.index = expected.index.astype(str)  # Categorical not preserved
+        expected.index = expected.index.astype(
+            str if not using_infer_string else "string[pyarrow_numpy]"
+        )  # Categorical not preserved
         expected.index.name = None  # index names aren't preserved in JSON
         assert_json_roundtrip_equal(result, expected, orient)
 
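Note (reviewer sketch, not part of the change): the branch on `using_infer_string` is needed because, once the `future.infer_string` option is on, string axes read back from JSON come in as the pyarrow-backed `string[pyarrow_numpy]` dtype rather than object. A minimal illustration, assuming pandas >= 2.1 with pyarrow installed:

```python
from io import StringIO

import pandas as pd

with pd.option_context("future.infer_string", True):
    df = pd.DataFrame({"a": [1, 2]}, index=["x", "y"])
    result = pd.read_json(StringIO(df.to_json(orient="columns")), orient="columns")
    # the string index is inferred as "string[pyarrow_numpy]" instead of object
    print(result.index.dtype)
```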
@@ -517,9 +522,9 @@ def test_v12_compat(self, datapath):
         df_iso = df.drop(["modified"], axis=1)
         v12_iso_json = os.path.join(dirpath, "tsframe_iso_v012.json")
         df_unser_iso = read_json(v12_iso_json)
-        tm.assert_frame_equal(df_iso, df_unser_iso)
+        tm.assert_frame_equal(df_iso, df_unser_iso, check_column_type=False)
 
-    def test_blocks_compat_GH9037(self):
+    def test_blocks_compat_GH9037(self, using_infer_string):
         index = pd.date_range("20000101", periods=10, freq="h")
         # freq doesn't round-trip
         index = DatetimeIndex(list(index), freq=None)
@@ -603,7 +608,9 @@ def test_blocks_compat_GH9037(self):
         )
 
         # JSON deserialisation always creates unicode strings
-        df_mixed.columns = df_mixed.columns.astype(np.str_)
+        df_mixed.columns = df_mixed.columns.astype(
+            np.str_ if not using_infer_string else "string[pyarrow_numpy]"
+        )
         data = StringIO(df_mixed.to_json(orient="split"))
         df_roundtrip = read_json(data, orient="split")
         tm.assert_frame_equal(
@@ -675,16 +682,19 @@ def test_series_non_unique_index(self):
         unserialized = read_json(
             StringIO(s.to_json(orient="records")), orient="records", typ="series"
         )
-        tm.assert_numpy_array_equal(s.values, unserialized.values)
+        tm.assert_equal(s.values, unserialized.values)
 
     def test_series_default_orient(self, string_series):
         assert string_series.to_json() == string_series.to_json(orient="index")
 
-    def test_series_roundtrip_simple(self, orient, string_series):
+    def test_series_roundtrip_simple(self, orient, string_series, using_infer_string):
         data = StringIO(string_series.to_json(orient=orient))
         result = read_json(data, typ="series", orient=orient)
 
         expected = string_series
+        if using_infer_string and orient in ("split", "index", "columns"):
+            # These schemas don't contain dtypes, so we infer string
+            expected.index = expected.index.astype("string[pyarrow_numpy]")
         if orient in ("values", "records"):
             expected = expected.reset_index(drop=True)
         if orient != "split":
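Note (sketch, not from the diff): the new comment about schemas is the crux here. Outside of orient="table", the serialized JSON carries no dtype metadata, so read_json has to infer axis dtypes on the way back, and with infer_string active a string index comes back as `string[pyarrow_numpy]`. A rough look at what the payloads actually contain:

```python
import pandas as pd

s = pd.Series([1.5, 2.5], index=["a", "b"], name="x")
# roughly {"name":"x","index":["a","b"],"data":[1.5,2.5]} -- values only, no dtypes
print(s.to_json(orient="split"))
# roughly [1.5,2.5] -- the index is dropped entirely, matching the reset_index branch
print(s.to_json(orient="records"))
```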
@@ -1458,6 +1468,9 @@ def test_from_json_to_json_table_dtypes(self):
         result = read_json(StringIO(dfjson), orient="table")
         tm.assert_frame_equal(result, expected)
 
+    # TODO: We are casting to string which coerces None to NaN before casting back
+    # to object, ending up with incorrect na values
+    @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="incorrect na conversion")
     @pytest.mark.parametrize("orient", ["split", "records", "index", "columns"])
     def test_to_json_from_json_columns_dtypes(self, orient):
         # GH21892 GH33205
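Note (my reading of the TODO above, not pandas internals): the xfail covers a round trip through the string dtype that loses the distinction between None and NaN, so the restored object column carries the wrong missing-value marker. A hedged sketch of that coercion, assuming pyarrow is installed:

```python
import pandas as pd

ser = pd.Series(["a", None], dtype=object)
via_string = ser.astype("string[pyarrow_numpy]").astype(object)
print(ser.tolist())         # ['a', None]
print(via_string.tolist())  # likely ['a', nan]; the None does not survive the cast
```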
@@ -1715,6 +1728,11 @@ def test_to_json_indent(self, indent):
 
         assert result == expected
 
+    @pytest.mark.skipif(
+        using_pyarrow_string_dtype(),
+        reason="Adjust expected when infer_string is default, no bug here, "
+        "just a complicated parametrization",
+    )
     @pytest.mark.parametrize(
         "orient,expected",
         [
@@ -1990,7 +2008,9 @@ def test_json_uint64(self):
     @pytest.mark.parametrize(
         "orient", ["split", "records", "values", "index", "columns"]
     )
-    def test_read_json_dtype_backend(self, string_storage, dtype_backend, orient):
+    def test_read_json_dtype_backend(
+        self, string_storage, dtype_backend, orient, using_infer_string
+    ):
         # GH#50750
         pa = pytest.importorskip("pyarrow")
         df = DataFrame(
@@ -2006,7 +2026,10 @@ def test_read_json_dtype_backend(self, string_storage, dtype_backend, orient):
             }
         )
 
-        if string_storage == "python":
+        if using_infer_string:
+            string_array = ArrowStringArrayNumpySemantics(pa.array(["a", "b", "c"]))
+            string_array_na = ArrowStringArrayNumpySemantics(pa.array(["a", "b", None]))
+        elif string_storage == "python":
             string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
             string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_))
 
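Note (sketch, assumes pyarrow is installed): with infer_string enabled, read_json is expected to hand back the numpy-semantics Arrow string array, which is why the expected columns are now built with ArrowStringArrayNumpySemantics instead of StringArray. Constructing one directly, the same way the test does:

```python
import pyarrow as pa

from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics

expected_col = ArrowStringArrayNumpySemantics(pa.array(["a", "b", None]))
print(expected_col.dtype)  # string[pyarrow_numpy]
```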