13
13
import pytest
14
14
15
15
import cudf
16
- from cudf .core ._compat import PANDAS_GE_200
16
+ from cudf .core ._compat import PANDAS_GE_200 , PANDAS_GE_210
17
17
from cudf .testing ._utils import (
18
18
DATETIME_TYPES ,
19
19
NUMERIC_TYPES ,
20
20
TIMEDELTA_TYPES ,
21
21
assert_eq ,
22
+ expect_warning_if ,
22
23
)
23
24
24
25
@@ -95,6 +96,8 @@ def json_files(request, tmp_path_factory, pdf):
95
96
)
96
97
if index is False and orient == "table" :
97
98
pytest .skip ("'index=False' isn't valid when 'orient' is 'table'" )
99
+ if index is True and orient not in ("split" , "table" , "index" , "columns" ):
100
+ pytest .skip ("'index=True' is only valid when 'orient' is 'split', 'table', 'index', or 'columns'" )
98
101
fname_df = tmp_path_factory .mktemp ("json" ) / "test_df.json"
99
102
fname_series = tmp_path_factory .mktemp ("json" ) / "test_series.json"
100
103
pdf .to_json (fname_df , index = index , compression = compression , orient = orient )
@@ -338,8 +341,16 @@ def json_input(request, tmp_path_factory):
338
341
@pytest .mark .filterwarnings ("ignore:Using CPU" )
339
342
@pytest .mark .parametrize ("engine" , ["auto" , "cudf" , "pandas" ])
340
343
def test_json_lines_basic (json_input , engine ):
341
- cu_df = cudf .read_json (json_input , engine = engine , lines = True )
342
- pd_df = pd .read_json (json_input , lines = True )
344
+ with expect_warning_if (
345
+ isinstance (json_input , str ) and not json_input .endswith (".json" )
346
+ ):
347
+ cu_df = cudf .read_json (json_input , engine = engine , lines = True )
348
+ with expect_warning_if (
349
+ isinstance (json_input , str )
350
+ and PANDAS_GE_210
351
+ and not json_input .endswith (".json" )
352
+ ):
353
+ pd_df = pd .read_json (json_input , lines = True )
343
354
344
355
assert all (cu_df .dtypes == ["int64" , "int64" , "int64" ])
345
356
for cu_col , pd_col in zip (cu_df .columns , pd_df .columns ):
@@ -353,7 +364,12 @@ def test_json_lines_multiple(tmpdir, json_input, engine):
353
364
tmp_file1 = tmpdir .join ("MultiInputs1.json" )
354
365
tmp_file2 = tmpdir .join ("MultiInputs2.json" )
355
366
356
- pdf = pd .read_json (json_input , lines = True )
367
+ with expect_warning_if (
368
+ isinstance (json_input , str )
369
+ and PANDAS_GE_210
370
+ and not json_input .endswith (".json" )
371
+ ):
372
+ pdf = pd .read_json (json_input , lines = True )
357
373
pdf .to_json (tmp_file1 , compression = "infer" , lines = True , orient = "records" )
358
374
pdf .to_json (tmp_file2 , compression = "infer" , lines = True , orient = "records" )
359
375
@@ -368,7 +384,12 @@ def test_json_lines_multiple(tmpdir, json_input, engine):
368
384
369
385
@pytest .mark .parametrize ("engine" , ["auto" , "cudf" ])
370
386
def test_json_read_directory (tmpdir , json_input , engine ):
371
- pdf = pd .read_json (json_input , lines = True )
387
+ with expect_warning_if (
388
+ isinstance (json_input , str )
389
+ and PANDAS_GE_210
390
+ and not json_input .endswith (".json" )
391
+ ):
392
+ pdf = pd .read_json (json_input , lines = True )
372
393
pdf .to_json (
373
394
tmpdir .join ("MultiInputs1.json" ),
374
395
compression = "infer" ,
@@ -400,37 +421,47 @@ def test_json_read_directory(tmpdir, json_input, engine):
400
421
def test_json_lines_byte_range (json_input ):
401
422
# include the first row and half of the second row
402
423
# should parse the first two rows
403
- df = cudf . read_json (
404
- copy . deepcopy ( json_input ), lines = True , byte_range = ( 0 , 15 )
424
+ will_warn = isinstance ( json_input , str ) and not json_input . endswith (
425
+ ".json"
405
426
)
427
+ with expect_warning_if (will_warn ):
428
+ df = cudf .read_json (
429
+ copy .deepcopy (json_input ), lines = True , byte_range = (0 , 15 )
430
+ )
406
431
assert df .shape == (2 , 3 )
407
432
408
433
# include half of the second row and half of the third row
409
434
# should parse only the third row
410
- df = cudf .read_json (
411
- copy .deepcopy (json_input ), lines = True , byte_range = (15 , 10 )
412
- )
435
+ with expect_warning_if (will_warn ):
436
+ df = cudf .read_json (
437
+ copy .deepcopy (json_input ), lines = True , byte_range = (15 , 10 )
438
+ )
413
439
assert df .shape == (1 , 3 )
414
440
415
441
# include half of the second row and entire third row
416
442
# should parse only the third row
417
- df = cudf .read_json (
418
- copy .deepcopy (json_input ), lines = True , byte_range = (15 , 0 )
419
- )
443
+ with expect_warning_if (will_warn ):
444
+ df = cudf .read_json (
445
+ copy .deepcopy (json_input ), lines = True , byte_range = (15 , 0 )
446
+ )
420
447
assert df .shape == (1 , 3 )
421
448
422
449
# include half of the second row till past the end of the file
423
450
# should parse only the third row
424
- df = cudf .read_json (
425
- copy .deepcopy (json_input ), lines = True , byte_range = (10 , 50 )
426
- )
451
+ with expect_warning_if (will_warn ):
452
+ df = cudf .read_json (
453
+ copy .deepcopy (json_input ), lines = True , byte_range = (10 , 50 )
454
+ )
427
455
assert df .shape == (1 , 3 )
428
456
429
457
430
458
def test_json_lines_dtypes (json_input ):
431
- df = cudf .read_json (
432
- json_input , lines = True , dtype = {1 : "int" , 2 : "short" , 0 : "float" }
433
- )
459
+ with expect_warning_if (
460
+ isinstance (json_input , str ) and not json_input .endswith (".json" )
461
+ ):
462
+ df = cudf .read_json (
463
+ json_input , lines = True , dtype = {1 : "int" , 2 : "short" , 0 : "float" }
464
+ )
434
465
assert all (df .dtypes == ["float64" , "int64" , "int16" ])
435
466
436
467
@@ -470,32 +501,32 @@ def test_json_engine_selection():
470
501
json = "[1, 2, 3]"
471
502
472
503
# should use the cudf engine
473
- df = cudf .read_json (json , lines = True )
504
+ df = cudf .read_json (StringIO ( json ) , lines = True )
474
505
# column names are strings when parsing with cudf
475
506
for col_name in df .columns :
476
507
assert isinstance (col_name , str )
477
508
478
509
# should use the pandas engine
479
- df = cudf .read_json (json , lines = False , engine = "pandas" )
510
+ df = cudf .read_json (StringIO ( json ) , lines = False , engine = "pandas" )
480
511
# column names are ints when parsing with pandas
481
512
for col_name in df .columns :
482
513
assert isinstance (col_name , int )
483
514
484
515
# should use the pandas engine
485
- df = cudf .read_json (json , lines = True , engine = "pandas" )
516
+ df = cudf .read_json (StringIO ( json ) , lines = True , engine = "pandas" )
486
517
# column names are ints when parsing with pandas
487
518
for col_name in df .columns :
488
519
assert isinstance (col_name , int )
489
520
490
521
# should raise an exception
491
522
with pytest .raises (ValueError ):
492
- cudf .read_json (json , lines = False , engine = "cudf_legacy" )
523
+ cudf .read_json (StringIO ( json ) , lines = False , engine = "cudf_legacy" )
493
524
494
525
495
526
def test_json_bool_values ():
496
527
buffer = "[true,1]\n [false,false]\n [true,true]"
497
- cu_df = cudf .read_json (buffer , lines = True )
498
- pd_df = pd .read_json (buffer , lines = True )
528
+ cu_df = cudf .read_json (StringIO ( buffer ) , lines = True )
529
+ pd_df = pd .read_json (StringIO ( buffer ) , lines = True )
499
530
500
531
# types should be ['bool', 'int64']
501
532
np .testing .assert_array_equal (pd_df .dtypes , cu_df .dtypes )
@@ -504,7 +535,7 @@ def test_json_bool_values():
504
535
np .testing .assert_array_equal (pd_df [1 ], cu_df ["1" ].to_numpy ())
505
536
506
537
cu_df = cudf .read_json (
507
- buffer , lines = True , dtype = {"0" : "bool" , "1" : "long" }
538
+ StringIO ( buffer ) , lines = True , dtype = {"0" : "bool" , "1" : "long" }
508
539
)
509
540
np .testing .assert_array_equal (pd_df .dtypes , cu_df .dtypes )
510
541
@@ -522,7 +553,7 @@ def test_json_bool_values():
522
553
],
523
554
)
524
555
def test_json_null_literal (buffer ):
525
- df = cudf .read_json (buffer , lines = True , engine = "cudf_legacy" )
556
+ df = cudf .read_json (StringIO ( buffer ) , lines = True , engine = "cudf_legacy" )
526
557
527
558
# first column contains a null field, type should be set to float
528
559
# second column contains only empty fields, type should be set to int8
@@ -534,7 +565,7 @@ def test_json_null_literal(buffer):
534
565
535
566
536
567
def test_json_bad_protocol_string ():
537
- test_string = '{"field": "s3://path"}'
568
+ test_string = StringIO ( '{"field": "s3://path"}' )
538
569
539
570
expect = pd .DataFrame ([{"field" : "s3://path" }])
540
571
got = cudf .read_json (test_string , lines = True )
@@ -748,7 +779,7 @@ def test_default_integer_bitwidth_extremes(default_integer_bitwidth, engine):
748
779
def test_default_float_bitwidth (default_float_bitwidth ):
749
780
# Test that float columns in json are _inferred_ as 32 bit columns.
750
781
df = cudf .read_json (
751
- '{"a": 1.0, "b": 2.5}\n {"a": 3.5, "b": 4.0}' ,
782
+ StringIO ( '{"a": 1.0, "b": 2.5}\n {"a": 3.5, "b": 4.0}' ) ,
752
783
engine = "cudf" ,
753
784
lines = True ,
754
785
orient = "records" ,
@@ -1231,7 +1262,7 @@ def test_json_round_trip_gzip():
1231
1262
@pytest .mark .parametrize ("lines" , [True , False ])
1232
1263
def test_json_array_of_arrays (data , lines ):
1233
1264
data = data if lines else "[" + data .replace ("\n " , "," ) + "]"
1234
- pdf = pd .read_json (data , orient = "values" , lines = lines )
1265
+ pdf = pd .read_json (StringIO ( data ) , orient = "values" , lines = lines )
1235
1266
df = cudf .read_json (
1236
1267
StringIO (data ),
1237
1268
engine = "cudf" ,
@@ -1325,8 +1356,8 @@ def _replace_with_nulls(df, replace_items):
1325
1356
1326
1357
# both json lines and json string tested.
1327
1358
json_string = "[" + jsonl_string .replace ("\n " , "," ) + "]"
1328
- pdf = pd .read_json (jsonl_string , orient = "records" , lines = True )
1329
- pdf2 = pd .read_json (json_string , orient = "records" , lines = False )
1359
+ pdf = pd .read_json (StringIO ( jsonl_string ) , orient = "records" , lines = True )
1360
+ pdf2 = pd .read_json (StringIO ( json_string ) , orient = "records" , lines = False )
1330
1361
assert_eq (pdf , pdf2 )
1331
1362
# replace list elements with None if it has dict and non-dict
1332
1363
# in above test cases, these items are mixed with dict/list items
0 commit comments