19
19
pa_version_under11p0 ,
20
20
pa_version_under13p0 ,
21
21
pa_version_under15p0 ,
22
+ pa_version_under19p0 ,
22
23
)
23
24
24
25
import pandas as pd
@@ -261,8 +262,10 @@ def test_invalid_engine(df_compat):
261
262
check_round_trip (df_compat , "foo" , "bar" )
262
263
263
264
264
- def test_options_py (df_compat , pa ):
265
+ def test_options_py (df_compat , pa , using_infer_string ):
265
266
# use the set option
267
+ if using_infer_string and not pa_version_under19p0 :
268
+ df_compat .columns = df_compat .columns .astype ("str" )
266
269
267
270
with pd .option_context ("io.parquet.engine" , "pyarrow" ):
268
271
check_round_trip (df_compat )
@@ -798,18 +801,21 @@ def test_unsupported_float16_cleanup(self, pa, path_type):
798
801
799
802
def test_categorical (self , pa ):
800
803
# supported in >= 0.7.0
801
- df = pd .DataFrame ()
802
- df ["a" ] = pd .Categorical (list ("abcdef" ))
803
-
804
- # test for null, out-of-order values, and unobserved category
805
- df ["b" ] = pd .Categorical (
806
- ["bar" , "foo" , "foo" , "bar" , None , "bar" ],
807
- dtype = pd .CategoricalDtype (["foo" , "bar" , "baz" ]),
808
- )
809
-
810
- # test for ordered flag
811
- df ["c" ] = pd .Categorical (
812
- ["a" , "b" , "c" , "a" , "c" , "b" ], categories = ["b" , "c" , "d" ], ordered = True
804
+ df = pd .DataFrame (
805
+ {
806
+ "a" : pd .Categorical (list ("abcdef" )),
807
+ # test for null, out-of-order values, and unobserved category
808
+ "b" : pd .Categorical (
809
+ ["bar" , "foo" , "foo" , "bar" , None , "bar" ],
810
+ dtype = pd .CategoricalDtype (["foo" , "bar" , "baz" ]),
811
+ ),
812
+ # test for ordered flag
813
+ "c" : pd .Categorical (
814
+ ["a" , "b" , "c" , "a" , "c" , "b" ],
815
+ categories = ["b" , "c" , "d" ],
816
+ ordered = True ,
817
+ ),
818
+ }
813
819
)
814
820
815
821
check_round_trip (df , pa )
@@ -878,11 +884,13 @@ def test_s3_roundtrip_for_dir(
878
884
repeat = 1 ,
879
885
)
880
886
881
- def test_read_file_like_obj_support (self , df_compat ):
887
+ def test_read_file_like_obj_support (self , df_compat , using_infer_string ):
882
888
pytest .importorskip ("pyarrow" )
883
889
buffer = BytesIO ()
884
890
df_compat .to_parquet (buffer )
885
891
df_from_buf = read_parquet (buffer )
892
+ if using_infer_string and not pa_version_under19p0 :
893
+ df_compat .columns = df_compat .columns .astype ("str" )
886
894
tm .assert_frame_equal (df_compat , df_from_buf )
887
895
888
896
def test_expand_user (self , df_compat , monkeypatch ):
@@ -949,7 +957,7 @@ def test_additional_extension_arrays(self, pa, using_infer_string):
949
957
"c" : pd .Series (["a" , None , "c" ], dtype = "string" ),
950
958
}
951
959
)
952
- if using_infer_string :
960
+ if using_infer_string and pa_version_under19p0 :
953
961
check_round_trip (df , pa , expected = df .astype ({"c" : "str" }))
954
962
else :
955
963
check_round_trip (df , pa )
@@ -963,7 +971,10 @@ def test_pyarrow_backed_string_array(self, pa, string_storage, using_infer_strin
963
971
df = pd .DataFrame ({"a" : pd .Series (["a" , None , "c" ], dtype = "string[pyarrow]" )})
964
972
with pd .option_context ("string_storage" , string_storage ):
965
973
if using_infer_string :
966
- expected = df .astype ("str" )
974
+ if pa_version_under19p0 :
975
+ expected = df .astype ("str" )
976
+ else :
977
+ expected = df .astype (f"string[{ string_storage } ]" )
967
978
expected .columns = expected .columns .astype ("str" )
968
979
else :
969
980
expected = df .astype (f"string[{ string_storage } ]" )
@@ -1128,17 +1139,24 @@ def test_df_attrs_persistence(self, tmp_path, pa):
1128
1139
new_df = read_parquet (path , engine = pa )
1129
1140
assert new_df .attrs == df .attrs
1130
1141
1131
- def test_string_inference (self , tmp_path , pa ):
1142
+ def test_string_inference (self , tmp_path , pa , using_infer_string ):
1132
1143
# GH#54431
1133
1144
path = tmp_path / "test_string_inference.p"
1134
1145
df = pd .DataFrame (data = {"a" : ["x" , "y" ]}, index = ["a" , "b" ])
1135
- df .to_parquet (path , engine = "pyarrow" )
1146
+ df .to_parquet (path , engine = pa )
1136
1147
with pd .option_context ("future.infer_string" , True ):
1137
- result = read_parquet (path , engine = "pyarrow" )
1148
+ result = read_parquet (path , engine = pa )
1149
+ dtype = pd .StringDtype (na_value = np .nan )
1138
1150
expected = pd .DataFrame (
1139
1151
data = {"a" : ["x" , "y" ]},
1140
- dtype = pd .StringDtype (na_value = np .nan ),
1141
- index = pd .Index (["a" , "b" ], dtype = pd .StringDtype (na_value = np .nan )),
1152
+ dtype = dtype ,
1153
+ index = pd .Index (["a" , "b" ], dtype = dtype ),
1154
+ columns = pd .Index (
1155
+ ["a" ],
1156
+ dtype = object
1157
+ if pa_version_under19p0 and not using_infer_string
1158
+ else dtype ,
1159
+ ),
1142
1160
)
1143
1161
tm .assert_frame_equal (result , expected )
1144
1162
@@ -1151,7 +1169,10 @@ def test_roundtrip_decimal(self, tmp_path, pa):
1151
1169
df = pd .DataFrame ({"a" : [Decimal ("123.00" )]}, dtype = "string[pyarrow]" )
1152
1170
df .to_parquet (path , schema = pa .schema ([("a" , pa .decimal128 (5 ))]))
1153
1171
result = read_parquet (path )
1154
- expected = pd .DataFrame ({"a" : ["123" ]}, dtype = "string[python]" )
1172
+ if pa_version_under19p0 :
1173
+ expected = pd .DataFrame ({"a" : ["123" ]}, dtype = "string[python]" )
1174
+ else :
1175
+ expected = pd .DataFrame ({"a" : [Decimal ("123.00" )]}, dtype = "object" )
1155
1176
tm .assert_frame_equal (result , expected )
1156
1177
1157
1178
def test_infer_string_large_string_type (self , tmp_path , pa ):
0 commit comments