@@ -596,6 +596,46 @@ def test_write_column_index_nonstring(self, pa):
596
596
msg = r"parquet must have string column names"
597
597
self .check_error_on_write (df , engine , ValueError , msg )
598
598
599
def test_use_nullable_dtypes(self, engine):
    """Check that ``use_nullable_dtypes=True`` returns nullable extension dtypes.

    The parquet file is always written directly with pyarrow, then read back
    with the parametrized ``engine`` twice: once with default settings and
    once with ``use_nullable_dtypes=True``.
    """
    # The file is produced with pyarrow regardless of the engine under test,
    # so skip (instead of failing with ImportError) when pyarrow is missing.
    pq = pytest.importorskip("pyarrow.parquet")

    if engine == "fastparquet":
        pytest.importorskip(
            "fastparquet",
            "0.7.1",
            reason="fastparquet must be 0.7.1 or higher for nullable dtype support",
        )

    table = pyarrow.table(
        {
            "a": pyarrow.array([1, 2, 3, None], "int64"),
            "b": pyarrow.array([1, 2, 3, None], "uint8"),
            "c": pyarrow.array(["a", "b", "c", None]),
            "d": pyarrow.array([True, False, True, None]),
        }
    )
    with tm.ensure_clean() as path:
        # write manually with pyarrow to write integers
        pq.write_table(table, path)
        result1 = read_parquet(path, engine=engine)
        result2 = read_parquet(path, engine=engine, use_nullable_dtypes=True)

    # Without the option, the integer column with a missing value is float64.
    assert result1["a"].dtype == np.dtype("float64")
    expected = pd.DataFrame(
        {
            "a": pd.array([1, 2, 3, None], dtype="Int64"),
            "b": pd.array([1, 2, 3, None], dtype="UInt8"),
            "c": pd.array(["a", "b", "c", None], dtype="string"),
            "d": pd.array([True, False, True, None], dtype="boolean"),
        }
    )
    if engine == "fastparquet":
        # Fastparquet doesn't support string columns yet
        # Only int and boolean
        result2 = result2.drop("c", axis=1)
        expected = expected.drop("c", axis=1)
    tm.assert_frame_equal(result2, expected)
638
+
599
639
600
640
@pytest .mark .filterwarnings ("ignore:CategoricalBlock is deprecated:DeprecationWarning" )
601
641
class TestParquetPyArrow (Base ):
@@ -842,35 +882,6 @@ def test_additional_extension_types(self, pa):
842
882
)
843
883
check_round_trip (df , pa )
844
884
845
@td.skip_if_no("pyarrow")
def test_use_nullable_dtypes(self, pa):
    """Read a pyarrow-written file with and without ``use_nullable_dtypes``.

    The default read is expected to give float64 for the integer column that
    contains a missing value; the nullable read is expected to give the
    Int64 / UInt8 / string / boolean extension dtypes.
    """
    import pyarrow.parquet as pq

    source_table = pyarrow.table(
        {
            "a": pyarrow.array([1, 2, 3, None], "int64"),
            "b": pyarrow.array([1, 2, 3, None], "uint8"),
            "c": pyarrow.array(["a", "b", "c", None]),
            "d": pyarrow.array([True, False, True, None]),
        }
    )
    with tm.ensure_clean() as path:
        # The table is written through pyarrow itself so that the integer
        # columns are written as integers.
        pq.write_table(source_table, path)
        default_read = read_parquet(path)
        nullable_read = read_parquet(path, use_nullable_dtypes=True)

    assert default_read["a"].dtype == np.dtype("float64")

    # Column name -> (values, nullable dtype) for the expected frame.
    nullable_columns = {
        "a": ([1, 2, 3, None], "Int64"),
        "b": ([1, 2, 3, None], "UInt8"),
        "c": (["a", "b", "c", None], "string"),
        "d": ([True, False, True, None], "boolean"),
    }
    expected = pd.DataFrame(
        {
            name: pd.array(values, dtype=dtype)
            for name, (values, dtype) in nullable_columns.items()
        }
    )
    tm.assert_frame_equal(nullable_read, expected)
873
-
874
885
def test_timestamp_nanoseconds (self , pa ):
875
886
# with version 2.0, pyarrow defaults to writing the nanoseconds, so
876
887
# this should work without error
@@ -941,7 +952,9 @@ def test_duplicate_columns(self, fp):
941
952
def test_bool_with_none(self, fp):
    """Round-trip a boolean column that contains ``None`` through ``fp``."""
    frame = pd.DataFrame({"a": [True, None, False]})
    as_floats = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16")
    # The dtype comparison is disabled: a fastparquet bug in 0.7.1 makes
    # this column come back as float64.
    check_round_trip(frame, fp, expected=as_floats, check_dtype=False)
945
958
946
959
def test_unsupported (self , fp ):
947
960
@@ -1062,9 +1075,14 @@ def test_timezone_aware_index(self, fp, timezone_aware_date_list):
1062
1075
expected .index .name = "index"
1063
1076
check_round_trip (df , fp , expected = expected )
1064
1077
1065
- def test_use_nullable_dtypes_not_supported (self , fp ):
1078
+ def test_use_nullable_dtypes_not_supported (self , monkeypatch , fp ):
1066
1079
df = pd .DataFrame ({"a" : [1 , 2 ]})
1067
1080
1081
+ # This is supported now in fastparquet 0.7.1 and above actually
1082
+ # Still need to ensure that this raises in all versions below
1083
+ import fastparquet as fp
1084
+
1085
+ monkeypatch .setattr (fp , "__version__" , "0.4" )
1068
1086
with tm .ensure_clean () as path :
1069
1087
df .to_parquet (path )
1070
1088
with pytest .raises (ValueError , match = "not supported for the fastparquet" ):
0 commit comments