Skip to content

Commit 02324e6

Browse files
authored
Adjust tests in json folder for new string option (#56197)
* BUG: read_json not handling string dtype when converting to dates
* Adjust tests in json folder for new string option
1 parent 00a0216 commit 02324e6

File tree

2 files changed

+51
-13
lines changed

2 files changed

+51
-13
lines changed

pandas/tests/io/json/test_json_table_schema.py

+19 -4
Original file line number | Diff line number | Diff line change
@@ -56,7 +56,7 @@ def df_table():
5656

5757

5858
class TestBuildSchema:
59-
def test_build_table_schema(self, df_schema):
59+
def test_build_table_schema(self, df_schema, using_infer_string):
6060
result = build_table_schema(df_schema, version=False)
6161
expected = {
6262
"fields": [
@@ -68,6 +68,8 @@ def test_build_table_schema(self, df_schema):
6868
],
6969
"primaryKey": ["idx"],
7070
}
71+
if using_infer_string:
72+
expected["fields"][2] = {"name": "B", "type": "any", "extDtype": "string"}
7173
assert result == expected
7274
result = build_table_schema(df_schema)
7375
assert "pandas_version" in result
@@ -97,7 +99,7 @@ def test_series_unnamed(self):
9799
}
98100
assert result == expected
99101

100-
def test_multiindex(self, df_schema):
102+
def test_multiindex(self, df_schema, using_infer_string):
101103
df = df_schema
102104
idx = pd.MultiIndex.from_product([("a", "b"), (1, 2)])
103105
df.index = idx
@@ -114,6 +116,13 @@ def test_multiindex(self, df_schema):
114116
],
115117
"primaryKey": ["level_0", "level_1"],
116118
}
119+
if using_infer_string:
120+
expected["fields"][0] = {
121+
"name": "level_0",
122+
"type": "any",
123+
"extDtype": "string",
124+
}
125+
expected["fields"][3] = {"name": "B", "type": "any", "extDtype": "string"}
117126
assert result == expected
118127

119128
df.index.names = ["idx0", None]
@@ -156,7 +165,10 @@ def test_as_json_table_type_bool_data(self, bool_type):
156165
def test_as_json_table_type_date_data(self, date_data):
157166
assert as_json_table_type(date_data.dtype) == "datetime"
158167

159-
@pytest.mark.parametrize("str_data", [pd.Series(["a", "b"]), pd.Index(["a", "b"])])
168+
@pytest.mark.parametrize(
169+
"str_data",
170+
[pd.Series(["a", "b"], dtype=object), pd.Index(["a", "b"], dtype=object)],
171+
)
160172
def test_as_json_table_type_string_data(self, str_data):
161173
assert as_json_table_type(str_data.dtype) == "string"
162174

@@ -261,7 +273,7 @@ def test_read_json_from_to_json_results(self):
261273
tm.assert_frame_equal(result1, df)
262274
tm.assert_frame_equal(result2, df)
263275

264-
def test_to_json(self, df_table):
276+
def test_to_json(self, df_table, using_infer_string):
265277
df = df_table
266278
df.index.name = "idx"
267279
result = df.to_json(orient="table", date_format="iso")
@@ -292,6 +304,9 @@ def test_to_json(self, df_table):
292304
{"name": "H", "type": "datetime", "tz": "US/Central"},
293305
]
294306

307+
if using_infer_string:
308+
fields[2] = {"name": "B", "type": "any", "extDtype": "string"}
309+
295310
schema = {"fields": fields, "primaryKey": ["idx"]}
296311
data = [
297312
OrderedDict(

pandas/tests/io/json/test_pandas.py

+32 -9
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,8 @@
1313
import numpy as np
1414
import pytest
1515

16+
from pandas._config import using_pyarrow_string_dtype
17+
1618
from pandas.compat import IS64
1719
import pandas.util._test_decorators as td
1820

@@ -30,6 +32,7 @@
3032
ArrowStringArray,
3133
StringArray,
3234
)
35+
from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics
3336

3437
from pandas.io.json import ujson_dumps
3538

@@ -237,7 +240,7 @@ def test_roundtrip_str_axes(self, orient, convert_axes, dtype):
237240

238241
@pytest.mark.parametrize("convert_axes", [True, False])
239242
def test_roundtrip_categorical(
240-
self, request, orient, categorical_frame, convert_axes
243+
self, request, orient, categorical_frame, convert_axes, using_infer_string
241244
):
242245
# TODO: create a better frame to test with and improve coverage
243246
if orient in ("index", "columns"):
@@ -251,7 +254,9 @@ def test_roundtrip_categorical(
251254
result = read_json(data, orient=orient, convert_axes=convert_axes)
252255

253256
expected = categorical_frame.copy()
254-
expected.index = expected.index.astype(str) # Categorical not preserved
257+
expected.index = expected.index.astype(
258+
str if not using_infer_string else "string[pyarrow_numpy]"
259+
) # Categorical not preserved
255260
expected.index.name = None # index names aren't preserved in JSON
256261
assert_json_roundtrip_equal(result, expected, orient)
257262

@@ -517,9 +522,9 @@ def test_v12_compat(self, datapath):
517522
df_iso = df.drop(["modified"], axis=1)
518523
v12_iso_json = os.path.join(dirpath, "tsframe_iso_v012.json")
519524
df_unser_iso = read_json(v12_iso_json)
520-
tm.assert_frame_equal(df_iso, df_unser_iso)
525+
tm.assert_frame_equal(df_iso, df_unser_iso, check_column_type=False)
521526

522-
def test_blocks_compat_GH9037(self):
527+
def test_blocks_compat_GH9037(self, using_infer_string):
523528
index = pd.date_range("20000101", periods=10, freq="h")
524529
# freq doesn't round-trip
525530
index = DatetimeIndex(list(index), freq=None)
@@ -603,7 +608,9 @@ def test_blocks_compat_GH9037(self):
603608
)
604609

605610
# JSON deserialisation always creates unicode strings
606-
df_mixed.columns = df_mixed.columns.astype(np.str_)
611+
df_mixed.columns = df_mixed.columns.astype(
612+
np.str_ if not using_infer_string else "string[pyarrow_numpy]"
613+
)
607614
data = StringIO(df_mixed.to_json(orient="split"))
608615
df_roundtrip = read_json(data, orient="split")
609616
tm.assert_frame_equal(
@@ -675,16 +682,19 @@ def test_series_non_unique_index(self):
675682
unserialized = read_json(
676683
StringIO(s.to_json(orient="records")), orient="records", typ="series"
677684
)
678-
tm.assert_numpy_array_equal(s.values, unserialized.values)
685+
tm.assert_equal(s.values, unserialized.values)
679686

680687
def test_series_default_orient(self, string_series):
681688
assert string_series.to_json() == string_series.to_json(orient="index")
682689

683-
def test_series_roundtrip_simple(self, orient, string_series):
690+
def test_series_roundtrip_simple(self, orient, string_series, using_infer_string):
684691
data = StringIO(string_series.to_json(orient=orient))
685692
result = read_json(data, typ="series", orient=orient)
686693

687694
expected = string_series
695+
if using_infer_string and orient in ("split", "index", "columns"):
696+
# These schemas don't contain dtypes, so we infer string
697+
expected.index = expected.index.astype("string[pyarrow_numpy]")
688698
if orient in ("values", "records"):
689699
expected = expected.reset_index(drop=True)
690700
if orient != "split":
@@ -1458,6 +1468,9 @@ def test_from_json_to_json_table_dtypes(self):
14581468
result = read_json(StringIO(dfjson), orient="table")
14591469
tm.assert_frame_equal(result, expected)
14601470

1471+
# TODO: We are casting to string which coerces None to NaN before casting back
1472+
# to object, ending up with incorrect na values
1473+
@pytest.mark.xfail(using_pyarrow_string_dtype(), reason="incorrect na conversion")
14611474
@pytest.mark.parametrize("orient", ["split", "records", "index", "columns"])
14621475
def test_to_json_from_json_columns_dtypes(self, orient):
14631476
# GH21892 GH33205
@@ -1715,6 +1728,11 @@ def test_to_json_indent(self, indent):
17151728

17161729
assert result == expected
17171730

1731+
@pytest.mark.skipif(
1732+
using_pyarrow_string_dtype(),
1733+
reason="Adjust expected when infer_string is default, no bug here, "
1734+
"just a complicated parametrization",
1735+
)
17181736
@pytest.mark.parametrize(
17191737
"orient,expected",
17201738
[
@@ -1990,7 +2008,9 @@ def test_json_uint64(self):
19902008
@pytest.mark.parametrize(
19912009
"orient", ["split", "records", "values", "index", "columns"]
19922010
)
1993-
def test_read_json_dtype_backend(self, string_storage, dtype_backend, orient):
2011+
def test_read_json_dtype_backend(
2012+
self, string_storage, dtype_backend, orient, using_infer_string
2013+
):
19942014
# GH#50750
19952015
pa = pytest.importorskip("pyarrow")
19962016
df = DataFrame(
@@ -2006,7 +2026,10 @@ def test_read_json_dtype_backend(self, string_storage, dtype_backend, orient):
20062026
}
20072027
)
20082028

2009-
if string_storage == "python":
2029+
if using_infer_string:
2030+
string_array = ArrowStringArrayNumpySemantics(pa.array(["a", "b", "c"]))
2031+
string_array_na = ArrowStringArrayNumpySemantics(pa.array(["a", "b", None]))
2032+
elif string_storage == "python":
20102033
string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
20112034
string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_))
20122035

0 commit comments

Comments
 (0)