19
19
pa_version_under11p0 ,
20
20
pa_version_under13p0 ,
21
21
pa_version_under15p0 ,
22
+ pa_version_under19p0 ,
22
23
)
23
24
24
25
import pandas as pd
@@ -110,10 +111,7 @@ def fp(request):
110
111
111
112
@pytest .fixture
112
113
def df_compat ():
113
- # TODO(infer_string) should this give str columns?
114
- return pd .DataFrame (
115
- {"A" : [1 , 2 , 3 ], "B" : "foo" }, columns = pd .Index (["A" , "B" ], dtype = object )
116
- )
114
+ return pd .DataFrame ({"A" : [1 , 2 , 3 ], "B" : "foo" }, columns = pd .Index (["A" , "B" ]))
117
115
118
116
119
117
@pytest .fixture
@@ -261,8 +259,10 @@ def test_invalid_engine(df_compat):
261
259
check_round_trip (df_compat , "foo" , "bar" )
262
260
263
261
264
- def test_options_py (df_compat , pa ):
262
+ def test_options_py (df_compat , pa , using_infer_string ):
265
263
# use the set option
264
+ if using_infer_string and not pa_version_under19p0 :
265
+ df_compat .columns = df_compat .columns .astype ("str" )
266
266
267
267
with pd .option_context ("io.parquet.engine" , "pyarrow" ):
268
268
check_round_trip (df_compat )
@@ -798,18 +798,21 @@ def test_unsupported_float16_cleanup(self, pa, path_type):
798
798
799
799
def test_categorical (self , pa ):
800
800
# supported in >= 0.7.0
801
- df = pd .DataFrame ()
802
- df ["a" ] = pd .Categorical (list ("abcdef" ))
803
-
804
- # test for null, out-of-order values, and unobserved category
805
- df ["b" ] = pd .Categorical (
806
- ["bar" , "foo" , "foo" , "bar" , None , "bar" ],
807
- dtype = pd .CategoricalDtype (["foo" , "bar" , "baz" ]),
808
- )
809
-
810
- # test for ordered flag
811
- df ["c" ] = pd .Categorical (
812
- ["a" , "b" , "c" , "a" , "c" , "b" ], categories = ["b" , "c" , "d" ], ordered = True
801
+ df = pd .DataFrame (
802
+ {
803
+ "a" : pd .Categorical (list ("abcdef" )),
804
+ # test for null, out-of-order values, and unobserved category
805
+ "b" : pd .Categorical (
806
+ ["bar" , "foo" , "foo" , "bar" , None , "bar" ],
807
+ dtype = pd .CategoricalDtype (["foo" , "bar" , "baz" ]),
808
+ ),
809
+ # test for ordered flag
810
+ "c" : pd .Categorical (
811
+ ["a" , "b" , "c" , "a" , "c" , "b" ],
812
+ categories = ["b" , "c" , "d" ],
813
+ ordered = True ,
814
+ ),
815
+ }
813
816
)
814
817
815
818
check_round_trip (df , pa )
@@ -878,11 +881,13 @@ def test_s3_roundtrip_for_dir(
878
881
repeat = 1 ,
879
882
)
880
883
881
- def test_read_file_like_obj_support (self , df_compat ):
884
+ def test_read_file_like_obj_support (self , df_compat , using_infer_string ):
882
885
pytest .importorskip ("pyarrow" )
883
886
buffer = BytesIO ()
884
887
df_compat .to_parquet (buffer )
885
888
df_from_buf = read_parquet (buffer )
889
+ if using_infer_string and not pa_version_under19p0 :
890
+ df_compat .columns = df_compat .columns .astype ("str" )
886
891
tm .assert_frame_equal (df_compat , df_from_buf )
887
892
888
893
def test_expand_user (self , df_compat , monkeypatch ):
@@ -949,7 +954,7 @@ def test_additional_extension_arrays(self, pa, using_infer_string):
949
954
"c" : pd .Series (["a" , None , "c" ], dtype = "string" ),
950
955
}
951
956
)
952
- if using_infer_string :
957
+ if using_infer_string and pa_version_under19p0 :
953
958
check_round_trip (df , pa , expected = df .astype ({"c" : "str" }))
954
959
else :
955
960
check_round_trip (df , pa )
@@ -963,7 +968,10 @@ def test_pyarrow_backed_string_array(self, pa, string_storage, using_infer_strin
963
968
df = pd .DataFrame ({"a" : pd .Series (["a" , None , "c" ], dtype = "string[pyarrow]" )})
964
969
with pd .option_context ("string_storage" , string_storage ):
965
970
if using_infer_string :
966
- expected = df .astype ("str" )
971
+ if pa_version_under19p0 :
972
+ expected = df .astype ("str" )
973
+ else :
974
+ expected = df .astype (f"string[{ string_storage } ]" )
967
975
expected .columns = expected .columns .astype ("str" )
968
976
else :
969
977
expected = df .astype (f"string[{ string_storage } ]" )
@@ -1128,17 +1136,24 @@ def test_df_attrs_persistence(self, tmp_path, pa):
1128
1136
new_df = read_parquet (path , engine = pa )
1129
1137
assert new_df .attrs == df .attrs
1130
1138
1131
- def test_string_inference (self , tmp_path , pa ):
1139
+ def test_string_inference (self , tmp_path , pa , using_infer_string ):
1132
1140
# GH#54431
1133
1141
path = tmp_path / "test_string_inference.p"
1134
1142
df = pd .DataFrame (data = {"a" : ["x" , "y" ]}, index = ["a" , "b" ])
1135
- df .to_parquet (path , engine = "pyarrow" )
1143
+ df .to_parquet (path , engine = pa )
1136
1144
with pd .option_context ("future.infer_string" , True ):
1137
- result = read_parquet (path , engine = "pyarrow" )
1145
+ result = read_parquet (path , engine = pa )
1146
+ dtype = pd .StringDtype (na_value = np .nan )
1138
1147
expected = pd .DataFrame (
1139
1148
data = {"a" : ["x" , "y" ]},
1140
- dtype = pd .StringDtype (na_value = np .nan ),
1141
- index = pd .Index (["a" , "b" ], dtype = pd .StringDtype (na_value = np .nan )),
1149
+ dtype = dtype ,
1150
+ index = pd .Index (["a" , "b" ], dtype = dtype ),
1151
+ columns = pd .Index (
1152
+ ["a" ],
1153
+ dtype = object
1154
+ if pa_version_under19p0 and not using_infer_string
1155
+ else dtype ,
1156
+ ),
1142
1157
)
1143
1158
tm .assert_frame_equal (result , expected )
1144
1159
@@ -1151,7 +1166,10 @@ def test_roundtrip_decimal(self, tmp_path, pa):
1151
1166
df = pd .DataFrame ({"a" : [Decimal ("123.00" )]}, dtype = "string[pyarrow]" )
1152
1167
df .to_parquet (path , schema = pa .schema ([("a" , pa .decimal128 (5 ))]))
1153
1168
result = read_parquet (path )
1154
- expected = pd .DataFrame ({"a" : ["123" ]}, dtype = "string[python]" )
1169
+ if pa_version_under19p0 :
1170
+ expected = pd .DataFrame ({"a" : ["123" ]}, dtype = "string[python]" )
1171
+ else :
1172
+ expected = pd .DataFrame ({"a" : [Decimal ("123.00" )]}, dtype = "object" )
1155
1173
tm .assert_frame_equal (result , expected )
1156
1174
1157
1175
def test_infer_string_large_string_type (self , tmp_path , pa ):
0 commit comments