@@ -575,6 +575,47 @@ def test_write_column_index_nonstring(self, pa):
575
575
msg = r"parquet must have string column names"
576
576
self .check_error_on_write (df , engine , ValueError , msg )
577
577
578
def test_use_nullable_dtypes(self, engine):
    """Check that ``use_nullable_dtypes=True`` returns nullable dtypes.

    A table with int64, uint8, string and boolean columns is written
    directly with pyarrow and read back through ``read_parquet`` twice:
    once with default options and once with ``use_nullable_dtypes=True``.
    The default read is expected to give ``float64`` for the integer
    column containing a null; the nullable read is compared against a
    frame built from ``Int64``/``UInt8``/``string``/``boolean`` arrays.

    Parameters
    ----------
    engine
        Parquet engine under test. The test is skipped when this equals
        ``"fastparquet"``.
    """
    # Decide on the skip before importing pyarrow, so a fastparquet run
    # is reported as skipped even when pyarrow cannot be imported.
    if engine == "fastparquet":
        # We are manually disabling fastparquet's
        # nullable dtype support pending discussion
        pytest.skip("Fastparquet nullable dtype support is disabled")

    import pyarrow.parquet as pq

    table = pyarrow.table(
        {
            "a": pyarrow.array([1, 2, 3, None], "int64"),
            "b": pyarrow.array([1, 2, 3, None], "uint8"),
            "c": pyarrow.array(["a", "b", "c", None]),
            "d": pyarrow.array([True, False, True, None]),
            # Test that nullable dtypes used even in absence of nulls
            "e": pyarrow.array([1, 2, 3, 4], "int64"),
        }
    )
    with tm.ensure_clean() as path:
        # write manually with pyarrow to write integers
        pq.write_table(table, path)
        result1 = read_parquet(path, engine=engine)
        result2 = read_parquet(path, engine=engine, use_nullable_dtypes=True)

    # Without the option, the int64 column holding a null comes back as float64.
    assert result1["a"].dtype == np.dtype("float64")
    expected = pd.DataFrame(
        {
            "a": pd.array([1, 2, 3, None], dtype="Int64"),
            "b": pd.array([1, 2, 3, None], dtype="UInt8"),
            "c": pd.array(["a", "b", "c", None], dtype="string"),
            "d": pd.array([True, False, True, None], dtype="boolean"),
            "e": pd.array([1, 2, 3, 4], dtype="Int64"),
        }
    )
    # NOTE(review): this branch cannot run while the fastparquet skip above
    # is in place; it is kept so the test is ready if the skip is removed.
    if engine == "fastparquet":
        # Fastparquet doesn't support string columns yet
        # Only int and boolean
        result2 = result2.drop("c", axis=1)
        expected = expected.drop("c", axis=1)
    tm.assert_frame_equal(result2, expected)
618
+
578
619
579
620
@pytest .mark .filterwarnings ("ignore:CategoricalBlock is deprecated:DeprecationWarning" )
580
621
class TestParquetPyArrow (Base ):
@@ -829,35 +870,6 @@ def test_additional_extension_types(self, pa):
829
870
)
830
871
check_round_trip (df , pa )
831
872
832
- @td .skip_if_no ("pyarrow" )
833
- def test_use_nullable_dtypes (self , pa ):
834
- import pyarrow .parquet as pq
835
-
836
- table = pyarrow .table (
837
- {
838
- "a" : pyarrow .array ([1 , 2 , 3 , None ], "int64" ),
839
- "b" : pyarrow .array ([1 , 2 , 3 , None ], "uint8" ),
840
- "c" : pyarrow .array (["a" , "b" , "c" , None ]),
841
- "d" : pyarrow .array ([True , False , True , None ]),
842
- }
843
- )
844
- with tm .ensure_clean () as path :
845
- # write manually with pyarrow to write integers
846
- pq .write_table (table , path )
847
- result1 = read_parquet (path )
848
- result2 = read_parquet (path , use_nullable_dtypes = True )
849
-
850
- assert result1 ["a" ].dtype == np .dtype ("float64" )
851
- expected = pd .DataFrame (
852
- {
853
- "a" : pd .array ([1 , 2 , 3 , None ], dtype = "Int64" ),
854
- "b" : pd .array ([1 , 2 , 3 , None ], dtype = "UInt8" ),
855
- "c" : pd .array (["a" , "b" , "c" , None ], dtype = "string" ),
856
- "d" : pd .array ([True , False , True , None ], dtype = "boolean" ),
857
- }
858
- )
859
- tm .assert_frame_equal (result2 , expected )
860
-
861
873
def test_timestamp_nanoseconds (self , pa ):
862
874
# with version 2.0, pyarrow defaults to writing the nanoseconds, so
863
875
# this should work without error
@@ -928,7 +940,9 @@ def test_duplicate_columns(self, fp):
928
940
def test_bool_with_none(self, fp):
    """Round-trip a boolean column containing ``None`` through fastparquet.

    The expected frame holds the values as floats, with NaN standing in
    for the missing entry.
    """
    frame = pd.DataFrame({"a": [True, None, False]})
    float_values = {"a": [1.0, np.nan, 0.0]}
    expected = pd.DataFrame(float_values, dtype="float16")
    # Fastparquet bug in 0.7.1 makes it so that this dtype becomes
    # float64, hence the dtype comparison is turned off.
    check_round_trip(frame, fp, expected=expected, check_dtype=False)
932
946
933
947
def test_unsupported (self , fp ):
934
948
@@ -1049,9 +1063,14 @@ def test_timezone_aware_index(self, fp, timezone_aware_date_list):
1049
1063
expected .index .name = "index"
1050
1064
check_round_trip (df , fp , expected = expected )
1051
1065
1052
- def test_use_nullable_dtypes_not_supported (self , fp ):
1066
+ def test_use_nullable_dtypes_not_supported (self , monkeypatch , fp ):
1053
1067
df = pd .DataFrame ({"a" : [1 , 2 ]})
1054
1068
1069
+ # This is supported now in fastparquet 0.7.1 and above actually
1070
+ # Still need to ensure that this raises in all versions below
1071
+ import fastparquet as fp
1072
+
1073
+ monkeypatch .setattr (fp , "__version__" , "0.4" )
1055
1074
with tm .ensure_clean () as path :
1056
1075
df .to_parquet (path )
1057
1076
with pytest .raises (ValueError , match = "not supported for the fastparquet" ):
0 commit comments