Skip to content

ENH: Deprecate literal json string input to read_json #53330

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 1 addition & 10 deletions doc/source/user_guide/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2028,10 +2028,7 @@ Reading a JSON string to pandas object can take a number of parameters.
The parser will try to parse a ``DataFrame`` if ``typ`` is not supplied or
is ``None``. To explicitly force ``Series`` parsing, pass ``typ=series``

* ``filepath_or_buffer`` : a **VALID** JSON string or file handle / StringIO. The string could be
a URL. Valid URL schemes include http, ftp, S3, and file. For file URLs, a host
is expected. For instance, a local file could be
file ://localhost/path/to/table.json
* ``filepath_or_buffer`` : a **VALID** file handle or StringIO object.
* ``typ`` : type of object to recover (series or frame), default 'frame'
* ``orient`` :

Expand Down Expand Up @@ -2107,12 +2104,6 @@ preserve string-like numbers (e.g. '1', '2') in an axes.

Thus there are times where you may want to specify specific dtypes via the ``dtype`` keyword argument.

Reading from a JSON string:

.. ipython:: python

pd.read_json(json)

Reading from a file:

.. ipython:: python
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,7 @@ Deprecations
- Deprecated allowing ``downcast`` keyword other than ``None``, ``False``, "infer", or a dict with these as values in :meth:`Series.fillna`, :meth:`DataFrame.fillna` (:issue:`40988`)
- Deprecated allowing arbitrary ``fill_value`` in :class:`SparseDtype`, in a future version the ``fill_value`` will need to be compatible with the ``dtype.subtype``, either a scalar that can be held by that subtype or ``NaN`` for integer or bool subtypes (:issue:`23124`)
- Deprecated constructing :class:`SparseArray` from scalar data, pass a sequence instead (:issue:`53039`)
- Deprecated literal json input to :func:`read_json`. Moving forward the method only accepts file-like objects (:issue:`53330`)
- Deprecated positional indexing on :class:`Series` with :meth:`Series.__getitem__` and :meth:`Series.__setitem__`, in a future version ``ser[item]`` will *always* interpret ``item`` as a label, not a position (:issue:`50617`)

.. ---------------------------------------------------------------------------
Expand Down
62 changes: 7 additions & 55 deletions pandas/io/json/_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,13 +51,8 @@
from pandas.io.common import (
IOHandles,
dedup_names,
extension_to_compression,
file_exists,
get_handle,
is_fsspec_url,
is_potential_multi_index,
is_url,
stringify_path,
)
from pandas.io.json._normalize import convert_to_line_delimits
from pandas.io.json._table_schema import (
Expand Down Expand Up @@ -498,7 +493,7 @@ def read_json(
decompression_options=_shared_docs["decompression_options"] % "path_or_buf",
)
def read_json(
path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
path_or_buf: ReadBuffer[str] | ReadBuffer[bytes],
*,
orient: str | None = None,
typ: Literal["frame", "series"] = "frame",
Expand All @@ -523,12 +518,7 @@ def read_json(

Parameters
----------
path_or_buf : a valid JSON str, path object or file-like object
Any valid string path is acceptable. The string could be a URL. Valid
URL schemes include http, ftp, s3, and file. For file URLs, a host is
expected. A local file could be:
``file://localhost/path/to/table.json``.

path_or_buf : a path object or file-like object
If you want to pass in a path object, pandas accepts any
``os.PathLike``.

Expand Down Expand Up @@ -750,6 +740,8 @@ def read_json(
}}\
'
"""
if isinstance(path_or_buf, str):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this check is probably too restrictive. I think we want to allow

pd.read_json("path/to/file.json")

Going forward. We just don't want

pd.read_json("{json: object}") # or however it's spelt

So rather than checking here. The deprecation probably wants enforced in JsonReader, perhaps like this?

diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index 8775c65f14..78f541d3df 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -18,6 +18,7 @@ from typing import (
     TypeVar,
     overload,
 )
+import warnings
 
 import numpy as np
 
@@ -30,6 +31,7 @@ from pandas._libs.tslibs import iNaT
 from pandas.compat._optional import import_optional_dependency
 from pandas.errors import AbstractMethodError
 from pandas.util._decorators import doc
+from pandas.util._exceptions import find_stack_level
 from pandas.util._validators import check_dtype_backend
 
 from pandas.core.dtypes.common import ensure_str
@@ -916,7 +918,14 @@ class JsonReader(abc.Iterator, Generic[FrameSeriesStrT]):
             and not file_exists(filepath_or_buffer)
         ):
             raise FileNotFoundError(f"File {filepath_or_buffer} does not exist")
-
+        else:
+            warnings.warn(
+                "Passing literal json to 'read_json' is deprecated and "
+                "will be removed in a future version. To read from a "
+                "literal string, wrap it in a 'StringIO' object.",
+                FutureWarning,
+                stacklevel=find_stack_level(),
+            )
         return filepath_or_buffer
 
     def _combine_lines(self, lines) -> str:

WDYT?

Copy link
Contributor Author

@rmhowe425 rmhowe425 May 22, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see the value in doing that. Though, that doesn't seem to be what our repo maintainers are going after

Copy link
Member

@twoertwein twoertwein May 22, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see the value in doing that. Though, that doesn't seem to be what our repo maintainers are going after

I'm quite sure @mroeschke wants to allow os.PathLike and str that represent file paths (that would be consistent with read/to_csv). My understanding of the issue is resolving the ambiguity of whether a string is a path or a "JSON-string" (that has caused issues in the past). When making changes that break current behavior (disallowing JSON-strings), we have to have at least one release with a deprecation warning.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ahh okay! I was thinking that they would want to turn strings into StringIO objects! And yeah I definitely understand that we would want to have at least one release with a deprecation warning! 💯

raise TypeError("cannot pass literal json to pandas.read_json")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we can only add a warning for now and then with 3.0 (or 2.2?) remove support for json literals.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh wow, that would be drastically different compared to the original ask in the GitHub issue. Though, that would follow along the same lines of what we have seen with DataFrame.concat and DataFrame.append

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Although I wrote the bug as ENH, my plan (although thanks for picking it up since I didn't get enough roundtoits until now), was to introduce a deprecation warning and then remove in an upcoming release. I think it's too user- facing to just remove immediately.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@wence- That makes perfect sense.

I may hit pause on this until for a day or two until we hear from mroeschke or another reviewer for official direction. I don't want to steer too far away from the original acceptance criteria / guidance in the Feature Description, although I think that they'll probably echo what you're suggesting 😄

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah @twoertwein and @wence- are correct here. We will need to add a warning to deprecate the current behavior of parsing string json in this PR before we can outright raise an error

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mroeschke Awesome! Thanks!

if orient == "table" and dtype:
raise ValueError("cannot pass both dtype and orient='table'")
if orient == "table" and convert_axes:
Expand Down Expand Up @@ -868,8 +860,9 @@ def __init__(
)
self.data = filepath_or_buffer
elif self.engine == "ujson":
data = self._get_data_from_filepath(filepath_or_buffer)
self.data = self._preprocess_data(data)
self.handles = get_handle(filepath_or_buffer, "r")
self.data = self._preprocess_data(filepath_or_buffer)
self.close()

def _preprocess_data(self, data):
"""
Expand All @@ -887,47 +880,6 @@ def _preprocess_data(self, data):

return data

def _get_data_from_filepath(self, filepath_or_buffer):
"""
The function read_json accepts three input types:
1. filepath (string-like)
2. file-like object (e.g. open file object, StringIO)
3. JSON string

This method turns (1) into (2) to simplify the rest of the processing.
It returns input types (2) and (3) unchanged.

It raises FileNotFoundError if the input is a string ending in
one of .json, .json.gz, .json.bz2, etc. but no such file exists.
"""
# if it is a string but the file does not exist, it might be a JSON string
filepath_or_buffer = stringify_path(filepath_or_buffer)
if (
not isinstance(filepath_or_buffer, str)
or is_url(filepath_or_buffer)
or is_fsspec_url(filepath_or_buffer)
or file_exists(filepath_or_buffer)
):
self.handles = get_handle(
filepath_or_buffer,
"r",
encoding=self.encoding,
compression=self.compression,
storage_options=self.storage_options,
errors=self.encoding_errors,
)
filepath_or_buffer = self.handles.handle
elif (
isinstance(filepath_or_buffer, str)
and filepath_or_buffer.lower().endswith(
(".json",) + tuple(f".json{c}" for c in extension_to_compression)
)
and not file_exists(filepath_or_buffer)
):
raise FileNotFoundError(f"File {filepath_or_buffer} does not exist")

return filepath_or_buffer

def _combine_lines(self, lines) -> str:
"""
Combines a list of JSON objects into one JSON object.
Expand Down
Loading