@@ -404,25 +404,6 @@ def test_columns_dtypes(self, engine):
404
404
df .columns = ["foo" , "bar" ]
405
405
check_round_trip (df , engine )
406
406
407
- def test_columns_dtypes_invalid (self , engine ):
408
- df = pd .DataFrame ({"string" : list ("abc" ), "int" : list (range (1 , 4 ))})
409
-
410
- msg = "parquet must have string column names"
411
- # numeric
412
- df .columns = [0 , 1 ]
413
- self .check_error_on_write (df , engine , ValueError , msg )
414
-
415
- # bytes
416
- df .columns = [b"foo" , b"bar" ]
417
- self .check_error_on_write (df , engine , ValueError , msg )
418
-
419
- # python object
420
- df .columns = [
421
- datetime .datetime (2011 , 1 , 1 , 0 , 0 ),
422
- datetime .datetime (2011 , 1 , 1 , 1 , 1 ),
423
- ]
424
- self .check_error_on_write (df , engine , ValueError , msg )
425
-
426
407
@pytest .mark .parametrize ("compression" , [None , "gzip" , "snappy" , "brotli" ])
427
408
def test_compression (self , engine , compression ):
428
409
if compression == "snappy" :
@@ -528,16 +509,16 @@ def test_write_column_multiindex(self, engine):
528
509
# fastparquet is not able to write column multi-indexes with non-string
# column names; pyarrow round-trips them.
529
510
mi_columns = pd .MultiIndex .from_tuples ([("a" , 1 ), ("a" , 2 ), ("b" , 1 )])
530
511
df = pd .DataFrame (np .random .randn (4 , 3 ), columns = mi_columns )
531
- msg = (
532
- r"\s*parquet must have string column names for all values in\s*"
533
- "each level of the MultiIndex"
534
- )
535
- self .check_error_on_write (df , engine , ValueError , msg )
536
512
537
- def test_write_column_multiindex_nonstring (self , pa ):
513
+ if engine == "fastparquet" :
514
+ self .check_error_on_write (
515
+ df , engine , TypeError , "Column name must be a string"
516
+ )
517
+ elif engine == "pyarrow" :
518
+ check_round_trip (df , engine )
519
+
520
+ def test_write_column_multiindex_nonstring (self , engine ):
538
521
# GH #34777
539
- # Not supported in fastparquet as of 0.1.3
540
- engine = pa
541
522
542
523
# fastparquet is not able to write column multi-indexes with non-string
# column names; pyarrow round-trips them.
543
524
arrays = [
@@ -546,11 +527,14 @@ def test_write_column_multiindex_nonstring(self, pa):
546
527
]
547
528
df = pd .DataFrame (np .random .randn (8 , 8 ), columns = arrays )
548
529
df .columns .names = ["Level1" , "Level2" ]
549
- msg = (
550
- r"\s*parquet must have string column names for all values in\s*"
551
- "each level of the MultiIndex"
552
- )
553
- self .check_error_on_write (df , engine , ValueError , msg )
530
+ if engine == "fastparquet" :
531
+ if Version (fastparquet .__version__ ) < Version ("0.7.0" ):
532
+ err = TypeError
533
+ else :
534
+ err = ValueError
535
+ self .check_error_on_write (df , engine , err , "Column name" )
536
+ elif engine == "pyarrow" :
537
+ check_round_trip (df , engine )
554
538
555
539
def test_write_column_multiindex_string (self , pa ):
556
540
# GH #34777
@@ -579,17 +563,19 @@ def test_write_column_index_string(self, pa):
579
563
580
564
check_round_trip (df , engine )
581
565
582
- def test_write_column_index_nonstring (self , pa ):
566
+ def test_write_column_index_nonstring (self , engine ):
583
567
# GH #34777
584
- # Not supported in fastparquet as of 0.1.3
585
- engine = pa
586
568
587
569
# Write column indexes with non-string column names
588
570
arrays = [1 , 2 , 3 , 4 ]
589
571
df = pd .DataFrame (np .random .randn (8 , 4 ), columns = arrays )
590
572
df .columns .name = "NonStringCol"
591
- msg = r"parquet must have string column names"
592
- self .check_error_on_write (df , engine , ValueError , msg )
573
+ if engine == "fastparquet" :
574
+ self .check_error_on_write (
575
+ df , engine , TypeError , "Column name must be a string"
576
+ )
577
+ else :
578
+ check_round_trip (df , engine )
593
579
594
580
@pytest .mark .skipif (pa_version_under7p0 , reason = "minimum pyarrow not installed" )
595
581
def test_dtype_backend (self , engine , request ):
@@ -1041,6 +1027,31 @@ def test_read_dtype_backend_pyarrow_config_index(self, pa):
1041
1027
expected = expected ,
1042
1028
)
1043
1029
1030
def test_columns_dtypes_not_invalid(self, pa):
    """Check which non-string column labels pyarrow can round-trip.

    Integer and ``datetime`` column labels are expected to survive a
    write/read cycle, while ``bytes`` labels are expected to raise
    ``NotImplementedError`` during the round trip.
    """
    df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))})

    # numeric
    df.columns = [0, 1]
    check_round_trip(df, pa)

    # bytes
    df.columns = [b"foo", b"bar"]
    # "|" is the regex alternation operator: an unescaped "|S3" has an empty
    # left-hand alternative and therefore matches *any* message. Escape it so
    # the assertion really checks for the literal "|S3" text.
    with pytest.raises(NotImplementedError, match=r"\|S3"):
        # Bytes fails on read_parquet
        check_round_trip(df, pa)

    # python object
    df.columns = [
        datetime.datetime(2011, 1, 1, 0, 0),
        datetime.datetime(2011, 1, 1, 1, 1),
    ]
    check_round_trip(df, pa)
1049
+
1050
def test_empty_columns(self, pa):
    """Round-trip a frame that has a named index but no columns (GH 52034)."""
    # GH 52034
    named_index = pd.Index(["a", "b", "c"], name="custom name")
    columnless = pd.DataFrame(index=named_index)
    check_round_trip(columnless, pa)
1054
+
1044
1055
1045
1056
class TestParquetFastParquet (Base ):
1046
1057
def test_basic (self , fp , df_full ):
@@ -1052,6 +1063,27 @@ def test_basic(self, fp, df_full):
1052
1063
df ["timedelta" ] = pd .timedelta_range ("1 day" , periods = 3 )
1053
1064
check_round_trip (df , fp )
1054
1065
1066
def test_columns_dtypes_invalid(self, fp):
    """Check that fastparquet refuses to write non-string column labels.

    Each label set below is assigned to the frame in turn and the write is
    expected to fail with ``TypeError`` and the same message.
    """
    frame = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))})

    expected_error = TypeError
    expected_message = "Column name must be a string"

    rejected_label_sets = (
        # numeric
        [0, 1],
        # bytes
        [b"foo", b"bar"],
        # python object
        [
            datetime.datetime(2011, 1, 1, 0, 0),
            datetime.datetime(2011, 1, 1, 1, 1),
        ],
    )
    for labels in rejected_label_sets:
        frame.columns = labels
        self.check_error_on_write(frame, fp, expected_error, expected_message)
1086
+
1055
1087
def test_duplicate_columns (self , fp ):
1056
1088
# not currently able to handle duplicate columns
1057
1089
df = pd .DataFrame (np .arange (12 ).reshape (4 , 3 ), columns = list ("aaa" )).copy ()
@@ -1221,3 +1253,12 @@ def test_invalid_dtype_backend(self, engine):
1221
1253
df .to_parquet (path )
1222
1254
with pytest .raises (ValueError , match = msg ):
1223
1255
read_parquet (path , dtype_backend = "numpy" )
1256
+
1257
def test_empty_columns(self, fp):
    """Round-trip a column-less frame with a named index (GH 52034).

    The result is compared against an explicitly built frame whose empty
    column index has ``object`` dtype.
    """
    # GH 52034
    row_labels = ["a", "b", "c"]
    index_name = "custom name"

    columnless = pd.DataFrame(index=pd.Index(row_labels, name=index_name))

    empty_object_columns = pd.Index([], dtype=object)
    expected = pd.DataFrame(
        columns=empty_object_columns,
        index=pd.Index(row_labels, name=index_name),
    )

    check_round_trip(columnless, fp, expected=expected)
0 commit comments