@@ -13,6 +13,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_pyarrow_string_dtype
+
 from pandas.compat import IS64
 import pandas.util._test_decorators as td
 
@@ -30,6 +32,7 @@
     ArrowStringArray,
     StringArray,
 )
+from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics
 
 from pandas.io.json import ujson_dumps
 
@@ -237,7 +240,7 @@ def test_roundtrip_str_axes(self, orient, convert_axes, dtype):
 
     @pytest.mark.parametrize("convert_axes", [True, False])
     def test_roundtrip_categorical(
-        self, request, orient, categorical_frame, convert_axes
+        self, request, orient, categorical_frame, convert_axes, using_infer_string
     ):
         # TODO: create a better frame to test with and improve coverage
         if orient in ("index", "columns"):
@@ -251,7 +254,9 @@ def test_roundtrip_categorical(
         result = read_json(data, orient=orient, convert_axes=convert_axes)
 
         expected = categorical_frame.copy()
-        expected.index = expected.index.astype(str)  # Categorical not preserved
+        expected.index = expected.index.astype(
+            str if not using_infer_string else "string[pyarrow_numpy]"
+        )  # Categorical not preserved
         expected.index.name = None  # index names aren't preserved in JSON
         assert_json_roundtrip_equal(result, expected, orient)
 
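Note (reviewer sketch, not part of the change): the branch on `using_infer_string` is needed because, once the `future.infer_string` option is on, string axes read back from JSON come in as the pyarrow-backed `string[pyarrow_numpy]` dtype rather than object. A minimal illustration, assuming pandas >= 2.1 with pyarrow installed:

```python
from io import StringIO

import pandas as pd

with pd.option_context("future.infer_string", True):
    df = pd.DataFrame({"a": [1, 2]}, index=["x", "y"])
    result = pd.read_json(StringIO(df.to_json(orient="columns")), orient="columns")
    # the string index is inferred as "string[pyarrow_numpy]" instead of object
    print(result.index.dtype)
```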
@@ -517,9 +522,9 @@ def test_v12_compat(self, datapath):
         df_iso = df.drop(["modified"], axis=1)
         v12_iso_json = os.path.join(dirpath, "tsframe_iso_v012.json")
         df_unser_iso = read_json(v12_iso_json)
-        tm.assert_frame_equal(df_iso, df_unser_iso)
+        tm.assert_frame_equal(df_iso, df_unser_iso, check_column_type=False)
 
-    def test_blocks_compat_GH9037(self):
+    def test_blocks_compat_GH9037(self, using_infer_string):
         index = pd.date_range("20000101", periods=10, freq="h")
         # freq doesn't round-trip
         index = DatetimeIndex(list(index), freq=None)
@@ -603,7 +608,9 @@ def test_blocks_compat_GH9037(self):
         )
 
         # JSON deserialisation always creates unicode strings
-        df_mixed.columns = df_mixed.columns.astype(np.str_)
+        df_mixed.columns = df_mixed.columns.astype(
+            np.str_ if not using_infer_string else "string[pyarrow_numpy]"
+        )
         data = StringIO(df_mixed.to_json(orient="split"))
         df_roundtrip = read_json(data, orient="split")
         tm.assert_frame_equal(
@@ -675,16 +682,19 @@ def test_series_non_unique_index(self):
         unserialized = read_json(
             StringIO(s.to_json(orient="records")), orient="records", typ="series"
         )
-        tm.assert_numpy_array_equal(s.values, unserialized.values)
+        tm.assert_equal(s.values, unserialized.values)
 
     def test_series_default_orient(self, string_series):
         assert string_series.to_json() == string_series.to_json(orient="index")
 
-    def test_series_roundtrip_simple(self, orient, string_series):
+    def test_series_roundtrip_simple(self, orient, string_series, using_infer_string):
         data = StringIO(string_series.to_json(orient=orient))
         result = read_json(data, typ="series", orient=orient)
 
         expected = string_series
+        if using_infer_string and orient in ("split", "index", "columns"):
+            # These schemas don't contain dtypes, so we infer string
+            expected.index = expected.index.astype("string[pyarrow_numpy]")
         if orient in ("values", "records"):
             expected = expected.reset_index(drop=True)
         if orient != "split":
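Note (sketch, not from the diff): the new comment about schemas is the crux here. Outside of orient="table", the serialized JSON carries no dtype metadata, so read_json has to infer axis dtypes on the way back, and with infer_string active a string index comes back as `string[pyarrow_numpy]`. A rough look at what the payloads actually contain:

```python
import pandas as pd

s = pd.Series([1.5, 2.5], index=["a", "b"], name="x")
# roughly {"name":"x","index":["a","b"],"data":[1.5,2.5]} -- values only, no dtypes
print(s.to_json(orient="split"))
# roughly [1.5,2.5] -- the index is dropped entirely, matching the reset_index branch
print(s.to_json(orient="records"))
```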
@@ -1458,6 +1468,9 @@ def test_from_json_to_json_table_dtypes(self):
         result = read_json(StringIO(dfjson), orient="table")
         tm.assert_frame_equal(result, expected)
 
+    # TODO: We are casting to string which coerces None to NaN before casting back
+    # to object, ending up with incorrect na values
+    @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="incorrect na conversion")
     @pytest.mark.parametrize("orient", ["split", "records", "index", "columns"])
     def test_to_json_from_json_columns_dtypes(self, orient):
         # GH21892 GH33205
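Note (my reading of the TODO above, not pandas internals): the xfail covers a round trip through the string dtype that loses the distinction between None and NaN, so the restored object column carries the wrong missing-value marker. A hedged sketch of that coercion, assuming pyarrow is installed:

```python
import pandas as pd

ser = pd.Series(["a", None], dtype=object)
via_string = ser.astype("string[pyarrow_numpy]").astype(object)
print(ser.tolist())         # ['a', None]
print(via_string.tolist())  # likely ['a', nan]; the None does not survive the cast
```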
@@ -1715,6 +1728,11 @@ def test_to_json_indent(self, indent):
 
         assert result == expected
 
+    @pytest.mark.skipif(
+        using_pyarrow_string_dtype(),
+        reason="Adjust expected when infer_string is default, no bug here, "
+        "just a complicated parametrization",
+    )
     @pytest.mark.parametrize(
         "orient,expected",
         [
@@ -1990,7 +2008,9 @@ def test_json_uint64(self):
     @pytest.mark.parametrize(
         "orient", ["split", "records", "values", "index", "columns"]
     )
-    def test_read_json_dtype_backend(self, string_storage, dtype_backend, orient):
+    def test_read_json_dtype_backend(
+        self, string_storage, dtype_backend, orient, using_infer_string
+    ):
         # GH#50750
         pa = pytest.importorskip("pyarrow")
         df = DataFrame(
@@ -2006,7 +2026,10 @@ def test_read_json_dtype_backend(self, string_storage, dtype_backend, orient):
             }
         )
 
-        if string_storage == "python":
+        if using_infer_string:
+            string_array = ArrowStringArrayNumpySemantics(pa.array(["a", "b", "c"]))
+            string_array_na = ArrowStringArrayNumpySemantics(pa.array(["a", "b", None]))
+        elif string_storage == "python":
             string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
             string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_))
 
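Note (sketch, assumes pyarrow is installed): with infer_string enabled, read_json is expected to hand back the numpy-semantics Arrow string array, which is why the expected columns are now built with ArrowStringArrayNumpySemantics instead of StringArray. Constructing one directly, the same way the test does:

```python
import pyarrow as pa

from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics

expected_col = ArrowStringArrayNumpySemantics(pa.array(["a", "b", None]))
print(expected_col.dtype)  # string[pyarrow_numpy]
```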