
Commit 1efeb2b

rmhowe425 authored and im-vinicius committed
DEPR: Deprecate literal json string input to read_json (pandas-dev#53409)
* Adding logic to throw a deprecation warning when a literal json string is passed to read_json
* Adding logic to throw a deprecation warning when a literal json string is passed to read_json
* Updating documentation and adding PR num to unit test
* Adding a deprecation warning to the user guide
* Updating unit tests to check for FutureWarning
* Fixing unit tests
* Fixing unit tests
* Fixing unit tests
* Fixing unit tests
* Fixing documentation errors in PR feedback
* Fixing documentation errors in PR feedback
* Updating unit tests to use StringIO rather than catch FutureWarning
* Finishing updating unit tests to use StringIO rather than catch FutureWarning
* Fixing indentation errors in unit tests. Moved one unit test to another file.
* Updating unit test name
* Adding additional checks to unit tests
* Fixing unit tests
* Fixing unit tests
* Updating whatsnew documentation per reviewer recommendations.
* Fixing failing code tests
* Fixing failing code tests
* Adding import to doc string example
* Fixing documentation formatting error
* Fixing documentation formatting error
* Fixing documentation error after fixing merge conflict
* Fixing formatting errors in whatsnew file
* Updating formatting errors in documentation
* Updating formatting errors in documentation
1 parent 36c2a48 commit 1efeb2b
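
In practice the deprecation means callers wrap literal JSON strings in io.StringIO before handing them to read_json; file paths, URLs, and file-like objects are unaffected. A minimal before/after sketch (illustrative only, not part of the diff; the sample JSON string is invented):

    from io import StringIO

    import pandas as pd

    data = '{"col 1": {"row 1": "a", "row 2": "c"}, "col 2": {"row 1": "b", "row 2": "d"}}'

    # Before (now emits a FutureWarning):
    #     df = pd.read_json(data)
    # After: wrap the literal string in StringIO
    df = pd.read_json(StringIO(data))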

10 files changed: +248 -125 lines


doc/source/user_guide/io.rst (+10 -6)
@@ -2111,7 +2111,8 @@ Reading from a JSON string:
 
 .. ipython:: python
 
-    pd.read_json(json)
+    from io import StringIO
+    pd.read_json(StringIO(json))
 
 Reading from a file:
 
@@ -2135,6 +2136,7 @@ Preserve string indices:
 
 .. ipython:: python
 
+    from io import StringIO
     si = pd.DataFrame(
         np.zeros((4, 4)), columns=list(range(4)), index=[str(i) for i in range(4)]
     )
@@ -2143,7 +2145,7 @@ Preserve string indices:
     si.columns
     json = si.to_json()
 
-    sij = pd.read_json(json, convert_axes=False)
+    sij = pd.read_json(StringIO(json), convert_axes=False)
     sij
     sij.index
     sij.columns
@@ -2152,18 +2154,19 @@ Dates written in nanoseconds need to be read back in nanoseconds:
 
 .. ipython:: python
 
+    from io import StringIO
     json = dfj2.to_json(date_unit="ns")
 
     # Try to parse timestamps as milliseconds -> Won't Work
-    dfju = pd.read_json(json, date_unit="ms")
+    dfju = pd.read_json(StringIO(json), date_unit="ms")
     dfju
 
     # Let pandas detect the correct precision
-    dfju = pd.read_json(json)
+    dfju = pd.read_json(StringIO(json))
     dfju
 
     # Or specify that all timestamps are in nanoseconds
-    dfju = pd.read_json(json, date_unit="ns")
+    dfju = pd.read_json(StringIO(json), date_unit="ns")
    dfju
 
 By setting the ``dtype_backend`` argument you can control the default dtypes used for the resulting DataFrame.
@@ -2251,11 +2254,12 @@ For line-delimited json files, pandas can also return an iterator which reads in
 
 .. ipython:: python
 
+    from io import StringIO
     jsonl = """
         {"a": 1, "b": 2}
         {"a": 3, "b": 4}
     """
-    df = pd.read_json(jsonl, lines=True)
+    df = pd.read_json(StringIO(jsonl), lines=True)
     df
     df.to_json(orient="records", lines=True)

doc/source/whatsnew/v1.5.0.rst (+8 -5)
@@ -474,19 +474,22 @@ upon serialization. (Related issue :issue:`12997`)
 
 .. code-block:: ipython
 
-    In [4]: a.to_json(date_format='iso')
-    Out[4]: '{"2020-12-28T00:00:00.000Z":0,"2020-12-28T01:00:00.000Z":1,"2020-12-28T02:00:00.000Z":2}'
+    In [4]: from io import StringIO
 
-    In [5]: pd.read_json(a.to_json(date_format='iso'), typ="series").index == a.index
-    Out[5]: array([False, False, False])
+    In [5]: a.to_json(date_format='iso')
+    Out[5]: '{"2020-12-28T00:00:00.000Z":0,"2020-12-28T01:00:00.000Z":1,"2020-12-28T02:00:00.000Z":2}'
+
+    In [6]: pd.read_json(StringIO(a.to_json(date_format='iso')), typ="series").index == a.index
+    Out[6]: array([False, False, False])
 
 *New Behavior*
 
 .. ipython:: python
 
+    from io import StringIO
     a.to_json(date_format='iso')
     # Roundtripping now works
-    pd.read_json(a.to_json(date_format='iso'), typ="series").index == a.index
+    pd.read_json(StringIO(a.to_json(date_format='iso')), typ="series").index == a.index
 
 .. _whatsnew_150.notable_bug_fixes.groupby_value_counts_categorical:

doc/source/whatsnew/v2.1.0.rst (+1 -0)
@@ -293,6 +293,7 @@ Deprecations
 - Deprecated behavior of :func:`assert_series_equal` and :func:`assert_frame_equal` considering NA-like values (e.g. ``NaN`` vs ``None`` as equivalent) (:issue:`52081`)
 - Deprecated constructing :class:`SparseArray` from scalar data, pass a sequence instead (:issue:`53039`)
 - Deprecated falling back to filling when ``value`` is not specified in :meth:`DataFrame.replace` and :meth:`Series.replace` with non-dict-like ``to_replace`` (:issue:`33302`)
+- Deprecated literal json input to :func:`read_json`. Wrap literal json string input in ``io.StringIO`` instead. (:issue:`53409`)
 - Deprecated option "mode.use_inf_as_na", convert inf entries to ``NaN`` before instead (:issue:`51684`)
 - Deprecated parameter ``obj`` in :meth:`GroupBy.get_group` (:issue:`53545`)
 - Deprecated positional indexing on :class:`Series` with :meth:`Series.__getitem__` and :meth:`Series.__setitem__`, in a future version ``ser[item]`` will *always* interpret ``item`` as a label, not a position (:issue:`50617`)

pandas/io/json/_json.py (+30 -4)
@@ -18,6 +18,7 @@
     TypeVar,
     overload,
 )
+import warnings
 
 import numpy as np
 
@@ -30,6 +31,7 @@
 from pandas.compat._optional import import_optional_dependency
 from pandas.errors import AbstractMethodError
 from pandas.util._decorators import doc
+from pandas.util._exceptions import find_stack_level
 from pandas.util._validators import check_dtype_backend
 
 from pandas.core.dtypes.common import ensure_str
@@ -535,6 +537,10 @@ def read_json(
         By file-like object, we refer to objects with a ``read()`` method,
         such as a file handle (e.g. via builtin ``open`` function)
         or ``StringIO``.
+
+        .. deprecated:: 2.1.0
+            Passing json literal strings is deprecated.
+
     orient : str, optional
         Indication of expected JSON string format.
         Compatible JSON strings can be produced by ``to_json()`` with a
@@ -695,6 +701,7 @@ def read_json(
 
     Examples
     --------
+    >>> from io import StringIO
     >>> df = pd.DataFrame([['a', 'b'], ['c', 'd']],
     ...                   index=['row 1', 'row 2'],
     ...                   columns=['col 1', 'col 2'])
@@ -709,7 +716,7 @@ def read_json(
     "data":[["a","b"],["c","d"]]\
     }}\
     '
-    >>> pd.read_json(_, orient='split')
+    >>> pd.read_json(StringIO(_), orient='split')
           col 1 col 2
     row 1     a     b
     row 2     c     d
@@ -719,7 +726,7 @@ def read_json(
 
     >>> df.to_json(orient='index')
     '{{"row 1":{{"col 1":"a","col 2":"b"}},"row 2":{{"col 1":"c","col 2":"d"}}}}'
 
-    >>> pd.read_json(_, orient='index')
+    >>> pd.read_json(StringIO(_), orient='index')
           col 1 col 2
     row 1     a     b
     row 2     c     d
@@ -729,7 +736,7 @@ def read_json(
 
     >>> df.to_json(orient='records')
     '[{{"col 1":"a","col 2":"b"}},{{"col 1":"c","col 2":"d"}}]'
-    >>> pd.read_json(_, orient='records')
+    >>> pd.read_json(StringIO(_), orient='records')
       col 1 col 2
     0     a     b
     1     c     d
@@ -860,6 +867,18 @@ def __init__(
             self.nrows = validate_integer("nrows", self.nrows, 0)
             if not self.lines:
                 raise ValueError("nrows can only be passed if lines=True")
+        if (
+            isinstance(filepath_or_buffer, str)
+            and not self.lines
+            and "\n" in filepath_or_buffer
+        ):
+            warnings.warn(
+                "Passing literal json to 'read_json' is deprecated and "
+                "will be removed in a future version. To read from a "
+                "literal string, wrap it in a 'StringIO' object.",
+                FutureWarning,
+                stacklevel=find_stack_level(),
+            )
         if self.engine == "pyarrow":
             if not self.lines:
                 raise ValueError(
@@ -925,7 +944,14 @@ def _get_data_from_filepath(self, filepath_or_buffer):
                 and not file_exists(filepath_or_buffer)
             ):
                 raise FileNotFoundError(f"File {filepath_or_buffer} does not exist")
-
+            else:
+                warnings.warn(
+                    "Passing literal json to 'read_json' is deprecated and "
+                    "will be removed in a future version. To read from a "
+                    "literal string, wrap it in a 'StringIO' object.",
+                    FutureWarning,
+                    stacklevel=find_stack_level(),
+                )
         return filepath_or_buffer
 
     def _combine_lines(self, lines) -> str:
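
With these changes, a string that is neither a path nor an existing file now triggers the FutureWarning before being parsed, while file-like input stays warning-free. A small sketch of the resulting behavior (assumes a pandas build with this commit applied; the sample data is invented):

    import warnings
    from io import StringIO

    import pandas as pd

    payload = '{"a": [1, 2, 3], "b": [4, 5, 6]}'

    # Literal string input still parses, but now warns.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        df = pd.read_json(payload)
    assert any(issubclass(w.category, FutureWarning) for w in caught)

    # Wrapping the same string in StringIO is the forward-compatible spelling.
    df = pd.read_json(StringIO(payload))
    print(df)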

pandas/tests/io/json/test_compression.py (+10 -7)
@@ -1,4 +1,7 @@
-from io import BytesIO
+from io import (
+    BytesIO,
+    StringIO,
+)
 
 import pytest
 
@@ -22,7 +25,8 @@ def test_compression_roundtrip(compression):
     # explicitly ensure file was compressed.
     with tm.decompress_file(path, compression) as fh:
         result = fh.read().decode("utf8")
-        tm.assert_frame_equal(df, pd.read_json(result))
+        data = StringIO(result)
+        tm.assert_frame_equal(df, pd.read_json(data))
 
 
 def test_read_zipped_json(datapath):
@@ -39,8 +43,7 @@ def test_read_zipped_json(datapath):
 @pytest.mark.single_cpu
 def test_with_s3_url(compression, s3_resource, s3so):
     # Bucket "pandas-test" created in tests/io/conftest.py
-
-    df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
+    df = pd.read_json(StringIO('{"a": [1, 2, 3], "b": [4, 5, 6]}'))
 
     with tm.ensure_clean() as path:
         df.to_json(path, compression=compression)
@@ -55,15 +58,15 @@ def test_with_s3_url(compression, s3_resource, s3so):
 
 def test_lines_with_compression(compression):
     with tm.ensure_clean() as path:
-        df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
+        df = pd.read_json(StringIO('{"a": [1, 2, 3], "b": [4, 5, 6]}'))
         df.to_json(path, orient="records", lines=True, compression=compression)
         roundtripped_df = pd.read_json(path, lines=True, compression=compression)
         tm.assert_frame_equal(df, roundtripped_df)
 
 
 def test_chunksize_with_compression(compression):
     with tm.ensure_clean() as path:
-        df = pd.read_json('{"a": ["foo", "bar", "baz"], "b": [4, 5, 6]}')
+        df = pd.read_json(StringIO('{"a": ["foo", "bar", "baz"], "b": [4, 5, 6]}'))
         df.to_json(path, orient="records", lines=True, compression=compression)
 
         with pd.read_json(
@@ -74,7 +77,7 @@ def test_chunksize_with_compression(compression):
 
 
 def test_write_unsupported_compression_type():
-    df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
+    df = pd.read_json(StringIO('{"a": [1, 2, 3], "b": [4, 5, 6]}'))
     with tm.ensure_clean() as path:
         msg = "Unrecognized compression type: unsupported"
         with pytest.raises(ValueError, match=msg):
pandas/tests/io/json/test_deprecated_kwargs.py
@@ -1,6 +1,7 @@
 """
 Tests for the deprecated keyword arguments for `read_json`.
 """
+from io import StringIO
 
 import pandas as pd
 import pandas._testing as tm
@@ -10,9 +11,11 @@
 
 def test_good_kwargs():
     df = pd.DataFrame({"A": [2, 4, 6], "B": [3, 6, 9]}, index=[0, 1, 2])
+
     with tm.assert_produces_warning(None):
-        tm.assert_frame_equal(df, read_json(df.to_json(orient="split"), orient="split"))
-        tm.assert_frame_equal(
-            df, read_json(df.to_json(orient="columns"), orient="columns")
-        )
-        tm.assert_frame_equal(df, read_json(df.to_json(orient="index"), orient="index"))
+        data1 = StringIO(df.to_json(orient="split"))
+        tm.assert_frame_equal(df, read_json(data1, orient="split"))
+        data2 = StringIO(df.to_json(orient="columns"))
+        tm.assert_frame_equal(df, read_json(data2, orient="columns"))
+        data3 = StringIO(df.to_json(orient="index"))
+        tm.assert_frame_equal(df, read_json(data3, orient="index"))
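
The updated test avoids the new warning by design. If one instead wanted to assert that the warning fires for literal-string input, a hypothetical test along these lines would do it (sketch only; the test name and data are invented, not part of this commit):

    from io import StringIO

    import pandas as pd
    import pandas._testing as tm


    def test_literal_json_string_warns():
        data = '{"a": [1, 2, 3]}'
        # Literal string input should emit the new FutureWarning.
        with tm.assert_produces_warning(FutureWarning, match="Passing literal json"):
            pd.read_json(data)
        # The StringIO-wrapped equivalent stays warning-free.
        with tm.assert_produces_warning(None):
            pd.read_json(StringIO(data))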

pandas/tests/io/json/test_json_table_schema.py (+7 -5)
@@ -1,5 +1,6 @@
 """Tests for Table Schema integration."""
 from collections import OrderedDict
+from io import StringIO
 import json
 
 import numpy as np
@@ -254,7 +255,8 @@ def test_read_json_from_to_json_results(self):
                 "name_en": {"row_0": "Hakata Dolls Matsuo"},
             }
         )
-        result1 = pd.read_json(df.to_json())
+
+        result1 = pd.read_json(StringIO(df.to_json()))
         result2 = DataFrame.from_dict(json.loads(df.to_json()))
         tm.assert_frame_equal(result1, df)
         tm.assert_frame_equal(result2, df)
@@ -794,7 +796,7 @@ def test_comprehensive(self):
             index=pd.Index(range(4), name="idx"),
         )
 
-        out = df.to_json(orient="table")
+        out = StringIO(df.to_json(orient="table"))
         result = pd.read_json(out, orient="table")
         tm.assert_frame_equal(df, result)
 
@@ -810,15 +812,15 @@ def test_multiindex(self, index_names):
             columns=["Aussprache", "Griechisch", "Args"],
         )
         df.index.names = index_names
-        out = df.to_json(orient="table")
+        out = StringIO(df.to_json(orient="table"))
         result = pd.read_json(out, orient="table")
         tm.assert_frame_equal(df, result)
 
     def test_empty_frame_roundtrip(self):
         # GH 21287
         df = DataFrame(columns=["a", "b", "c"])
         expected = df.copy()
-        out = df.to_json(orient="table")
+        out = StringIO(df.to_json(orient="table"))
         result = pd.read_json(out, orient="table")
         tm.assert_frame_equal(expected, result)
 
@@ -841,5 +843,5 @@ def test_read_json_orient_table_old_schema_version(self):
         }
         """
         expected = DataFrame({"a": [1, 2.0, "s"]})
-        result = pd.read_json(df_json, orient="table")
+        result = pd.read_json(StringIO(df_json), orient="table")
         tm.assert_frame_equal(expected, result)

pandas/tests/io/json/test_json_table_schema_ext_dtype.py (+3 -2)
@@ -3,6 +3,7 @@
 from collections import OrderedDict
 import datetime as dt
 import decimal
+from io import StringIO
 import json
 
 import pytest
@@ -287,7 +288,7 @@ def test_json_ext_dtype_reading_roundtrip(self):
         )
         expected = df.copy()
         data_json = df.to_json(orient="table", indent=4)
-        result = read_json(data_json, orient="table")
+        result = read_json(StringIO(data_json), orient="table")
         tm.assert_frame_equal(result, expected)
 
     def test_json_ext_dtype_reading(self):
@@ -311,6 +312,6 @@ def test_json_ext_dtype_reading(self):
                 }
             ]
         }"""
-        result = read_json(data_json, orient="table")
+        result = read_json(StringIO(data_json), orient="table")
         expected = DataFrame({"a": Series([2, NA], dtype="Int64")})
         tm.assert_frame_equal(result, expected)
