Skip to content

Commit d7dc16e

Browse files
Deprecate reading literal string in cudf.read_json (#14619)
This PR deprecates reading literal strings in read_json; instead, users will need to wrap such input in StringIO to silence the warning. This change matches pandas-dev/pandas#53409. On pandas_2.0_feature_branch: = 501 failed, 101106 passed, 2071 skipped, 786 xfailed, 312 xpassed, 20 errors in 1234.91s (0:20:34) = This PR: = 426 failed, 101181 passed, 2091 skipped, 786 xfailed, 312 xpassed in 1126.93s (0:18:46) =
1 parent 72221b3 commit d7dc16e

File tree

4 files changed

+96
-34
lines changed

4 files changed

+96
-34
lines changed

python/cudf/cudf/io/json.py

+2
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,8 @@ def read_json(
102102
iotypes=(BytesIO, StringIO),
103103
allow_raw_text_input=True,
104104
storage_options=storage_options,
105+
warn_on_raw_text_input=True,
106+
warn_meta=("json", "read_json"),
105107
)
106108
if isinstance(tmp_source, list):
107109
filepaths_or_buffers.extend(tmp_source)

python/cudf/cudf/tests/test_json.py

+63-32
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,13 @@
1313
import pytest
1414

1515
import cudf
16-
from cudf.core._compat import PANDAS_GE_200
16+
from cudf.core._compat import PANDAS_GE_200, PANDAS_GE_210
1717
from cudf.testing._utils import (
1818
DATETIME_TYPES,
1919
NUMERIC_TYPES,
2020
TIMEDELTA_TYPES,
2121
assert_eq,
22+
expect_warning_if,
2223
)
2324

2425

@@ -95,6 +96,8 @@ def json_files(request, tmp_path_factory, pdf):
9596
)
9697
if index is False and orient == "table":
9798
pytest.skip("'index=False' isn't valid when 'orient' is 'table'")
99+
if index is True and orient not in ("split", "table", "index", "columns"):
100+
pytest.skip("'index=False' isn't valid when 'orient' is 'table'")
98101
fname_df = tmp_path_factory.mktemp("json") / "test_df.json"
99102
fname_series = tmp_path_factory.mktemp("json") / "test_series.json"
100103
pdf.to_json(fname_df, index=index, compression=compression, orient=orient)
@@ -338,8 +341,16 @@ def json_input(request, tmp_path_factory):
338341
@pytest.mark.filterwarnings("ignore:Using CPU")
339342
@pytest.mark.parametrize("engine", ["auto", "cudf", "pandas"])
340343
def test_json_lines_basic(json_input, engine):
341-
cu_df = cudf.read_json(json_input, engine=engine, lines=True)
342-
pd_df = pd.read_json(json_input, lines=True)
344+
with expect_warning_if(
345+
isinstance(json_input, str) and not json_input.endswith(".json")
346+
):
347+
cu_df = cudf.read_json(json_input, engine=engine, lines=True)
348+
with expect_warning_if(
349+
isinstance(json_input, str)
350+
and PANDAS_GE_210
351+
and not json_input.endswith(".json")
352+
):
353+
pd_df = pd.read_json(json_input, lines=True)
343354

344355
assert all(cu_df.dtypes == ["int64", "int64", "int64"])
345356
for cu_col, pd_col in zip(cu_df.columns, pd_df.columns):
@@ -353,7 +364,12 @@ def test_json_lines_multiple(tmpdir, json_input, engine):
353364
tmp_file1 = tmpdir.join("MultiInputs1.json")
354365
tmp_file2 = tmpdir.join("MultiInputs2.json")
355366

356-
pdf = pd.read_json(json_input, lines=True)
367+
with expect_warning_if(
368+
isinstance(json_input, str)
369+
and PANDAS_GE_210
370+
and not json_input.endswith(".json")
371+
):
372+
pdf = pd.read_json(json_input, lines=True)
357373
pdf.to_json(tmp_file1, compression="infer", lines=True, orient="records")
358374
pdf.to_json(tmp_file2, compression="infer", lines=True, orient="records")
359375

@@ -368,7 +384,12 @@ def test_json_lines_multiple(tmpdir, json_input, engine):
368384

369385
@pytest.mark.parametrize("engine", ["auto", "cudf"])
370386
def test_json_read_directory(tmpdir, json_input, engine):
371-
pdf = pd.read_json(json_input, lines=True)
387+
with expect_warning_if(
388+
isinstance(json_input, str)
389+
and PANDAS_GE_210
390+
and not json_input.endswith(".json")
391+
):
392+
pdf = pd.read_json(json_input, lines=True)
372393
pdf.to_json(
373394
tmpdir.join("MultiInputs1.json"),
374395
compression="infer",
@@ -400,37 +421,47 @@ def test_json_read_directory(tmpdir, json_input, engine):
400421
def test_json_lines_byte_range(json_input):
401422
# include the first row and half of the second row
402423
# should parse the first two rows
403-
df = cudf.read_json(
404-
copy.deepcopy(json_input), lines=True, byte_range=(0, 15)
424+
will_warn = isinstance(json_input, str) and not json_input.endswith(
425+
".json"
405426
)
427+
with expect_warning_if(will_warn):
428+
df = cudf.read_json(
429+
copy.deepcopy(json_input), lines=True, byte_range=(0, 15)
430+
)
406431
assert df.shape == (2, 3)
407432

408433
# include half of the second row and half of the third row
409434
# should parse only the third row
410-
df = cudf.read_json(
411-
copy.deepcopy(json_input), lines=True, byte_range=(15, 10)
412-
)
435+
with expect_warning_if(will_warn):
436+
df = cudf.read_json(
437+
copy.deepcopy(json_input), lines=True, byte_range=(15, 10)
438+
)
413439
assert df.shape == (1, 3)
414440

415441
# include half of the second row and entire third row
416442
# should parse only the third row
417-
df = cudf.read_json(
418-
copy.deepcopy(json_input), lines=True, byte_range=(15, 0)
419-
)
443+
with expect_warning_if(will_warn):
444+
df = cudf.read_json(
445+
copy.deepcopy(json_input), lines=True, byte_range=(15, 0)
446+
)
420447
assert df.shape == (1, 3)
421448

422449
# include half of the second row till past the end of the file
423450
# should parse only the third row
424-
df = cudf.read_json(
425-
copy.deepcopy(json_input), lines=True, byte_range=(10, 50)
426-
)
451+
with expect_warning_if(will_warn):
452+
df = cudf.read_json(
453+
copy.deepcopy(json_input), lines=True, byte_range=(10, 50)
454+
)
427455
assert df.shape == (1, 3)
428456

429457

430458
def test_json_lines_dtypes(json_input):
431-
df = cudf.read_json(
432-
json_input, lines=True, dtype={1: "int", 2: "short", 0: "float"}
433-
)
459+
with expect_warning_if(
460+
isinstance(json_input, str) and not json_input.endswith(".json")
461+
):
462+
df = cudf.read_json(
463+
json_input, lines=True, dtype={1: "int", 2: "short", 0: "float"}
464+
)
434465
assert all(df.dtypes == ["float64", "int64", "int16"])
435466

436467

@@ -470,32 +501,32 @@ def test_json_engine_selection():
470501
json = "[1, 2, 3]"
471502

472503
# should use the cudf engine
473-
df = cudf.read_json(json, lines=True)
504+
df = cudf.read_json(StringIO(json), lines=True)
474505
# column names are strings when parsing with cudf
475506
for col_name in df.columns:
476507
assert isinstance(col_name, str)
477508

478509
# should use the pandas engine
479-
df = cudf.read_json(json, lines=False, engine="pandas")
510+
df = cudf.read_json(StringIO(json), lines=False, engine="pandas")
480511
# column names are ints when parsing with pandas
481512
for col_name in df.columns:
482513
assert isinstance(col_name, int)
483514

484515
# should use the pandas engine
485-
df = cudf.read_json(json, lines=True, engine="pandas")
516+
df = cudf.read_json(StringIO(json), lines=True, engine="pandas")
486517
# column names are ints when parsing with pandas
487518
for col_name in df.columns:
488519
assert isinstance(col_name, int)
489520

490521
# should raise an exception
491522
with pytest.raises(ValueError):
492-
cudf.read_json(json, lines=False, engine="cudf_legacy")
523+
cudf.read_json(StringIO(json), lines=False, engine="cudf_legacy")
493524

494525

495526
def test_json_bool_values():
496527
buffer = "[true,1]\n[false,false]\n[true,true]"
497-
cu_df = cudf.read_json(buffer, lines=True)
498-
pd_df = pd.read_json(buffer, lines=True)
528+
cu_df = cudf.read_json(StringIO(buffer), lines=True)
529+
pd_df = pd.read_json(StringIO(buffer), lines=True)
499530

500531
# types should be ['bool', 'int64']
501532
np.testing.assert_array_equal(pd_df.dtypes, cu_df.dtypes)
@@ -504,7 +535,7 @@ def test_json_bool_values():
504535
np.testing.assert_array_equal(pd_df[1], cu_df["1"].to_numpy())
505536

506537
cu_df = cudf.read_json(
507-
buffer, lines=True, dtype={"0": "bool", "1": "long"}
538+
StringIO(buffer), lines=True, dtype={"0": "bool", "1": "long"}
508539
)
509540
np.testing.assert_array_equal(pd_df.dtypes, cu_df.dtypes)
510541

@@ -522,7 +553,7 @@ def test_json_bool_values():
522553
],
523554
)
524555
def test_json_null_literal(buffer):
525-
df = cudf.read_json(buffer, lines=True, engine="cudf_legacy")
556+
df = cudf.read_json(StringIO(buffer), lines=True, engine="cudf_legacy")
526557

527558
# first column contains a null field, type should be set to float
528559
# second column contains only empty fields, type should be set to int8
@@ -534,7 +565,7 @@ def test_json_null_literal(buffer):
534565

535566

536567
def test_json_bad_protocol_string():
537-
test_string = '{"field": "s3://path"}'
568+
test_string = StringIO('{"field": "s3://path"}')
538569

539570
expect = pd.DataFrame([{"field": "s3://path"}])
540571
got = cudf.read_json(test_string, lines=True)
@@ -748,7 +779,7 @@ def test_default_integer_bitwidth_extremes(default_integer_bitwidth, engine):
748779
def test_default_float_bitwidth(default_float_bitwidth):
749780
# Test that float columns in json are _inferred_ as 32 bit columns.
750781
df = cudf.read_json(
751-
'{"a": 1.0, "b": 2.5}\n{"a": 3.5, "b": 4.0}',
782+
StringIO('{"a": 1.0, "b": 2.5}\n{"a": 3.5, "b": 4.0}'),
752783
engine="cudf",
753784
lines=True,
754785
orient="records",
@@ -1231,7 +1262,7 @@ def test_json_round_trip_gzip():
12311262
@pytest.mark.parametrize("lines", [True, False])
12321263
def test_json_array_of_arrays(data, lines):
12331264
data = data if lines else "[" + data.replace("\n", ",") + "]"
1234-
pdf = pd.read_json(data, orient="values", lines=lines)
1265+
pdf = pd.read_json(StringIO(data), orient="values", lines=lines)
12351266
df = cudf.read_json(
12361267
StringIO(data),
12371268
engine="cudf",
@@ -1325,8 +1356,8 @@ def _replace_with_nulls(df, replace_items):
13251356

13261357
# both json lines and json string tested.
13271358
json_string = "[" + jsonl_string.replace("\n", ",") + "]"
1328-
pdf = pd.read_json(jsonl_string, orient="records", lines=True)
1329-
pdf2 = pd.read_json(json_string, orient="records", lines=False)
1359+
pdf = pd.read_json(StringIO(jsonl_string), orient="records", lines=True)
1360+
pdf2 = pd.read_json(StringIO(json_string), orient="records", lines=False)
13301361
assert_eq(pdf, pdf2)
13311362
# replace list elements with None if it has dict and non-dict
13321363
# in above test cases, these items are mixed with dict/list items

python/cudf/cudf/tests/test_s3.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import os
44
import socket
55
from contextlib import contextmanager
6-
from io import BytesIO
6+
from io import BytesIO, StringIO
77

88
import numpy as np
99
import pandas as pd
@@ -433,7 +433,7 @@ def test_read_json(s3_base, s3so):
433433
storage_options=s3so,
434434
)
435435

436-
expect = pd.read_json(buffer, lines=True)
436+
expect = pd.read_json(StringIO(buffer), lines=True)
437437
assert_eq(expect, got)
438438

439439

python/cudf/cudf/utils/ioutils.py

+29
Original file line numberDiff line numberDiff line change
@@ -1666,6 +1666,8 @@ def get_reader_filepath_or_buffer(
16661666
allow_raw_text_input=False,
16671667
storage_options=None,
16681668
bytes_per_thread=_BYTES_PER_THREAD_DEFAULT,
1669+
warn_on_raw_text_input=None,
1670+
warn_meta=None,
16691671
):
16701672
"""{docstring}"""
16711673

@@ -1679,6 +1681,15 @@ def get_reader_filepath_or_buffer(
16791681
path_or_data, storage_options
16801682
)
16811683
if fs is None:
1684+
if warn_on_raw_text_input:
1685+
# Do not remove until pandas 3.0 support is added.
1686+
warnings.warn(
1687+
f"Passing literal {warn_meta[0]} to {warn_meta[1]} is "
1688+
"deprecated and will be removed in a future version. "
1689+
"To read from a literal string, wrap it in a "
1690+
"'StringIO' object.",
1691+
FutureWarning,
1692+
)
16821693
return path_or_data, compression
16831694

16841695
if _is_local_filesystem(fs):
@@ -1691,6 +1702,24 @@ def get_reader_filepath_or_buffer(
16911702
raise FileNotFoundError(
16921703
f"{path_or_data} could not be resolved to any files"
16931704
)
1705+
elif warn_on_raw_text_input:
1706+
# Do not remove until pandas 3.0 support is added.
1707+
warnings.warn(
1708+
f"Passing literal {warn_meta[0]} to {warn_meta[1]} is "
1709+
"deprecated and will be removed in a future version. "
1710+
"To read from a literal string, wrap it in a "
1711+
"'StringIO' object.",
1712+
FutureWarning,
1713+
)
1714+
elif warn_on_raw_text_input:
1715+
# Do not remove until pandas 3.0 support is added.
1716+
warnings.warn(
1717+
f"Passing literal {warn_meta[0]} to {warn_meta[1]} is "
1718+
"deprecated and will be removed in a future version. "
1719+
"To read from a literal string, wrap it in a "
1720+
"'StringIO' object.",
1721+
FutureWarning,
1722+
)
16941723

16951724
else:
16961725
if len(paths) == 0:

0 commit comments

Comments
 (0)