@@ -575,6 +575,46 @@ def test_write_column_index_nonstring(self, pa):
575
575
msg = r"parquet must have string column names"
576
576
self .check_error_on_write (df , engine , ValueError , msg )
577
577
578
def test_use_nullable_dtypes(self, engine):
    """Check that ``use_nullable_dtypes=True`` produces nullable dtypes.

    A table with a missing value in each of an int64, uint8, string and
    boolean column is written directly with pyarrow, then read back through
    ``read_parquet`` twice with the given engine: once with defaults and
    once with ``use_nullable_dtypes=True``.

    Parameters
    ----------
    engine
        Value supplied by the ``engine`` fixture; it is compared against
        ``"fastparquet"`` and forwarded to ``read_parquet``.
    """
    # The file is always written with pyarrow, whichever engine reads it
    # back, so skip (instead of erroring on import) when pyarrow is missing.
    pyarrow = pytest.importorskip("pyarrow")
    pq = pytest.importorskip("pyarrow.parquet")

    if engine == "fastparquet":
        pytest.importorskip(
            "fastparquet",
            "0.7.1",
            reason="fastparquet must be 0.7.1 or higher for nullable dtype support",
        )

    table = pyarrow.table(
        {
            "a": pyarrow.array([1, 2, 3, None], "int64"),
            "b": pyarrow.array([1, 2, 3, None], "uint8"),
            "c": pyarrow.array(["a", "b", "c", None]),
            "d": pyarrow.array([True, False, True, None]),
        }
    )
    with tm.ensure_clean() as path:
        # write manually with pyarrow to write integers
        pq.write_table(table, path)
        result1 = read_parquet(path, engine=engine)
        result2 = read_parquet(path, engine=engine, use_nullable_dtypes=True)

    # Without nullable dtypes the integer column with a missing value is
    # expected to come back as float64.
    assert result1["a"].dtype == np.dtype("float64")
    expected = pd.DataFrame(
        {
            "a": pd.array([1, 2, 3, None], dtype="Int64"),
            "b": pd.array([1, 2, 3, None], dtype="UInt8"),
            "c": pd.array(["a", "b", "c", None], dtype="string"),
            "d": pd.array([True, False, True, None], dtype="boolean"),
        }
    )
    if engine == "fastparquet":
        # Fastparquet doesn't support string columns yet
        # Only int and boolean
        result2 = result2.drop("c", axis=1)
        expected = expected.drop("c", axis=1)
    tm.assert_frame_equal(result2, expected)
617
+
578
618
579
619
@pytest .mark .filterwarnings ("ignore:CategoricalBlock is deprecated:DeprecationWarning" )
580
620
class TestParquetPyArrow (Base ):
@@ -829,35 +869,6 @@ def test_additional_extension_types(self, pa):
829
869
)
830
870
check_round_trip (df , pa )
831
871
832
- @td .skip_if_no ("pyarrow" )
833
- def test_use_nullable_dtypes (self , pa ):
834
- import pyarrow .parquet as pq
835
-
836
- table = pyarrow .table (
837
- {
838
- "a" : pyarrow .array ([1 , 2 , 3 , None ], "int64" ),
839
- "b" : pyarrow .array ([1 , 2 , 3 , None ], "uint8" ),
840
- "c" : pyarrow .array (["a" , "b" , "c" , None ]),
841
- "d" : pyarrow .array ([True , False , True , None ]),
842
- }
843
- )
844
- with tm .ensure_clean () as path :
845
- # write manually with pyarrow to write integers
846
- pq .write_table (table , path )
847
- result1 = read_parquet (path )
848
- result2 = read_parquet (path , use_nullable_dtypes = True )
849
-
850
- assert result1 ["a" ].dtype == np .dtype ("float64" )
851
- expected = pd .DataFrame (
852
- {
853
- "a" : pd .array ([1 , 2 , 3 , None ], dtype = "Int64" ),
854
- "b" : pd .array ([1 , 2 , 3 , None ], dtype = "UInt8" ),
855
- "c" : pd .array (["a" , "b" , "c" , None ], dtype = "string" ),
856
- "d" : pd .array ([True , False , True , None ], dtype = "boolean" ),
857
- }
858
- )
859
- tm .assert_frame_equal (result2 , expected )
860
-
861
872
def test_timestamp_nanoseconds (self , pa ):
862
873
# with version 2.0, pyarrow defaults to writing the nanoseconds, so
863
874
# this should work without error
@@ -928,7 +939,9 @@ def test_duplicate_columns(self, fp):
928
939
def test_bool_with_none(self, fp):
    """Round-trip a boolean column containing ``None`` through ``fp``.

    The expected frame holds the values as floats with NaN for the missing
    entry; dtype comparison is turned off for the round-trip check.
    """
    frame = pd.DataFrame({"a": [True, None, False]})
    roundtripped = pd.DataFrame(
        {"a": [1.0, np.nan, 0.0]},
        dtype="float16",
    )
    # Fastparquet bug in 0.7.1 makes it so that this dtype becomes
    # float64, hence values are compared without checking the dtype.
    check_round_trip(frame, fp, check_dtype=False, expected=roundtripped)
932
945
933
946
def test_unsupported (self , fp ):
934
947
@@ -1049,9 +1062,14 @@ def test_timezone_aware_index(self, fp, timezone_aware_date_list):
1049
1062
expected .index .name = "index"
1050
1063
check_round_trip (df , fp , expected = expected )
1051
1064
1052
- def test_use_nullable_dtypes_not_supported (self , fp ):
1065
+ def test_use_nullable_dtypes_not_supported (self , monkeypatch , fp ):
1053
1066
df = pd .DataFrame ({"a" : [1 , 2 ]})
1054
1067
1068
+ # This is supported now in fastparquet 0.7.1 and above actually
1069
+ # Still need to ensure that this raises in all versions below
1070
+ import fastparquet as fp
1071
+
1072
+ monkeypatch .setattr (fp , "__version__" , "0.4" )
1055
1073
with tm .ensure_clean () as path :
1056
1074
df .to_parquet (path )
1057
1075
with pytest .raises (ValueError , match = "not supported for the fastparquet" ):
0 commit comments