diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 90a8bd868b60b..c18db3cee4c75 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -2028,10 +2028,7 @@ Reading a JSON string to pandas object can take a number of parameters. The parser will try to parse a ``DataFrame`` if ``typ`` is not supplied or is ``None``. To explicitly force ``Series`` parsing, pass ``typ=series`` -* ``filepath_or_buffer`` : a **VALID** JSON string or file handle / StringIO. The string could be - a URL. Valid URL schemes include http, ftp, S3, and file. For file URLs, a host - is expected. For instance, a local file could be - file ://localhost/path/to/table.json +* ``filepath_or_buffer`` : a file handle or ``StringIO`` object containing **VALID** JSON. * ``typ`` : type of object to recover (series or frame), default 'frame' * ``orient`` : @@ -2107,12 +2104,6 @@ preserve string-like numbers (e.g. '1', '2') in an axes. Thus there are times where you may want to specify specific dtypes via the ``dtype`` keyword argument. -Reading from a JSON string: - -.. ipython:: python - - pd.read_json(json) - Reading from a file: .. 
ipython:: python diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index db97602dcf4df..2db6629e952c3 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -266,6 +266,7 @@ Deprecations - Deprecated allowing ``downcast`` keyword other than ``None``, ``False``, "infer", or a dict with these as values in :meth:`Series.fillna`, :meth:`DataFrame.fillna` (:issue:`40988`) - Deprecated allowing arbitrary ``fill_value`` in :class:`SparseDtype`, in a future version the ``fill_value`` will need to be compatible with the ``dtype.subtype``, either a scalar that can be held by that subtype or ``NaN`` for integer or bool subtypes (:issue:`23124`) - Deprecated constructing :class:`SparseArray` from scalar data, pass a sequence instead (:issue:`53039`) +- Deprecated passing literal JSON strings to :func:`read_json`; wrap the string in ``io.StringIO`` instead. Moving forward the function only accepts file-like objects (:issue:`53330`) - Deprecated positional indexing on :class:`Series` with :meth:`Series.__getitem__` and :meth:`Series.__setitem__`, in a future version ``ser[item]`` will *always* interpret ``item`` as a label, not a position (:issue:`50617`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 5c2fba814375f..314701063d308 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -51,13 +51,8 @@ from pandas.io.common import ( IOHandles, dedup_names, - extension_to_compression, - file_exists, get_handle, - is_fsspec_url, is_potential_multi_index, - is_url, - stringify_path, ) from pandas.io.json._normalize import convert_to_line_delimits from pandas.io.json._table_schema import ( @@ -498,7 +493,7 @@ def read_json( decompression_options=_shared_docs["decompression_options"] % "path_or_buf", ) def read_json( - path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes], + path_or_buf: ReadBuffer[str] | ReadBuffer[bytes], *, orient: str | None = None, typ: Literal["frame", "series"] = "frame", @@ -523,12 +518,7 @@ def read_json( Parameters ---------- - path_or_buf : a valid JSON str, path object or file-like object - Any valid string path is acceptable. The string could be a URL. Valid - URL schemes include http, ftp, s3, and file. For file URLs, a host is - expected. A local file could be: - ``file://localhost/path/to/table.json``. - + path_or_buf : a path object or file-like object If you want to pass in a path object, pandas accepts any ``os.PathLike``. 
@@ -750,6 +740,8 @@ def read_json( }}\ ' """ + if isinstance(path_or_buf, str): + raise TypeError("cannot pass literal json to pandas.read_json") if orient == "table" and dtype: raise ValueError("cannot pass both dtype and orient='table'") if orient == "table" and convert_axes: @@ -868,8 +860,9 @@ def __init__( ) self.data = filepath_or_buffer elif self.engine == "ujson": - data = self._get_data_from_filepath(filepath_or_buffer) - self.data = self._preprocess_data(data) + self.handles = get_handle(filepath_or_buffer, "r") + self.data = self._preprocess_data(filepath_or_buffer) + self.close() def _preprocess_data(self, data): """ @@ -887,47 +880,6 @@ def _preprocess_data(self, data): return data - def _get_data_from_filepath(self, filepath_or_buffer): - """ - The function read_json accepts three input types: - 1. filepath (string-like) - 2. file-like object (e.g. open file object, StringIO) - 3. JSON string - - This method turns (1) into (2) to simplify the rest of the processing. - It returns input types (2) and (3) unchanged. - - It raises FileNotFoundError if the input is a string ending in - one of .json, .json.gz, .json.bz2, etc. but no such file exists. 
- """ - # if it is a string but the file does not exist, it might be a JSON string - filepath_or_buffer = stringify_path(filepath_or_buffer) - if ( - not isinstance(filepath_or_buffer, str) - or is_url(filepath_or_buffer) - or is_fsspec_url(filepath_or_buffer) - or file_exists(filepath_or_buffer) - ): - self.handles = get_handle( - filepath_or_buffer, - "r", - encoding=self.encoding, - compression=self.compression, - storage_options=self.storage_options, - errors=self.encoding_errors, - ) - filepath_or_buffer = self.handles.handle - elif ( - isinstance(filepath_or_buffer, str) - and filepath_or_buffer.lower().endswith( - (".json",) + tuple(f".json{c}" for c in extension_to_compression) - ) - and not file_exists(filepath_or_buffer) - ): - raise FileNotFoundError(f"File {filepath_or_buffer} does not exist") - - return filepath_or_buffer - def _combine_lines(self, lines) -> str: """ Combines a list of JSON objects into one JSON object. diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 254b6f8dded57..a474b7215a6ed 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -74,7 +74,8 @@ def test_frame_double_encoded_labels(self, orient): columns=["a \\ b", "y / z"], ) - result = read_json(df.to_json(orient=orient), orient=orient) + data = StringIO(df.to_json(orient=orient)) + result = read_json(data, orient=orient) expected = df.copy() assert_json_roundtrip_equal(result, expected, orient) @@ -82,7 +83,8 @@ def test_frame_double_encoded_labels(self, orient): @pytest.mark.parametrize("orient", ["split", "records", "values"]) def test_frame_non_unique_index(self, orient): df = DataFrame([["a", "b"], ["c", "d"]], index=[1, 1], columns=["x", "y"]) - result = read_json(df.to_json(orient=orient), orient=orient) + data = StringIO(df.to_json(orient=orient)) + result = read_json(data, orient=orient) expected = df.copy() assert_json_roundtrip_equal(result, expected, orient) @@ -106,10 +108,9 @@ def 
test_frame_non_unique_index_raises(self, orient): ) def test_frame_non_unique_columns(self, orient, data): df = DataFrame(data, index=[1, 2], columns=["x", "x"]) + output = df.to_json(orient=orient) + result = read_json(StringIO(output), orient=orient, convert_dates=["x"]) - result = read_json( - df.to_json(orient=orient), orient=orient, convert_dates=["x"] - ) if orient == "values": expected = DataFrame(data) if expected.iloc[:, 0].dtype == "datetime64[ns]": @@ -139,7 +140,9 @@ def test_frame_default_orient(self, float_frame): @pytest.mark.parametrize("convert_axes", [True, False]) def test_roundtrip_simple(self, orient, convert_axes, dtype, float_frame): data = float_frame.to_json(orient=orient) - result = read_json(data, orient=orient, convert_axes=convert_axes, dtype=dtype) + result = read_json( + StringIO(data), orient=orient, convert_axes=convert_axes, dtype=dtype + ) expected = float_frame @@ -149,7 +152,9 @@ def test_roundtrip_simple(self, orient, convert_axes, dtype, float_frame): @pytest.mark.parametrize("convert_axes", [True, False]) def test_roundtrip_intframe(self, orient, convert_axes, dtype, int_frame): data = int_frame.to_json(orient=orient) - result = read_json(data, orient=orient, convert_axes=convert_axes, dtype=dtype) + result = read_json( + StringIO(data), orient=orient, convert_axes=convert_axes, dtype=dtype + ) expected = int_frame assert_json_roundtrip_equal(result, expected, orient) @@ -164,7 +169,9 @@ def test_roundtrip_str_axes(self, orient, convert_axes, dtype): ) data = df.to_json(orient=orient) - result = read_json(data, orient=orient, convert_axes=convert_axes, dtype=dtype) + result = read_json( + StringIO(data), orient=orient, convert_axes=convert_axes, dtype=dtype + ) expected = df.copy() if not dtype: @@ -199,7 +206,7 @@ def test_roundtrip_categorical( data = categorical_frame.to_json(orient=orient) - result = read_json(data, orient=orient, convert_axes=convert_axes) + result = read_json(StringIO(data), orient=orient, 
convert_axes=convert_axes) expected = categorical_frame.copy() expected.index = expected.index.astype(str) # Categorical not preserved @@ -210,7 +217,7 @@ def test_roundtrip_categorical( def test_roundtrip_empty(self, orient, convert_axes): empty_frame = DataFrame() data = empty_frame.to_json(orient=orient) - result = read_json(data, orient=orient, convert_axes=convert_axes) + result = read_json(StringIO(data), orient=orient, convert_axes=convert_axes) if orient == "split": idx = pd.Index([], dtype=(float if convert_axes else object)) expected = DataFrame(index=idx, columns=idx) @@ -225,7 +232,7 @@ def test_roundtrip_empty(self, orient, convert_axes): def test_roundtrip_timestamp(self, orient, convert_axes, datetime_frame): # TODO: improve coverage with date_format parameter data = datetime_frame.to_json(orient=orient) - result = read_json(data, orient=orient, convert_axes=convert_axes) + result = read_json(StringIO(data), orient=orient, convert_axes=convert_axes) expected = datetime_frame.copy() if not convert_axes: # one off for ts handling @@ -251,7 +258,7 @@ def test_roundtrip_mixed(self, orient, convert_axes): df = DataFrame(data=values, index=index) data = df.to_json(orient=orient) - result = read_json(data, orient=orient, convert_axes=convert_axes) + result = read_json(StringIO(data), orient=orient, convert_axes=convert_axes) expected = df.copy() expected = expected.assign(**expected.select_dtypes("number").astype(np.int64)) @@ -276,7 +283,8 @@ def test_roundtrip_multiindex(self, columns): columns=pd.MultiIndex.from_arrays(columns), ) - result = read_json(df.to_json(orient="split"), orient="split") + data = df.to_json(orient="split") + result = read_json(StringIO(data), orient="split") tm.assert_frame_equal(result, df) @@ -322,8 +330,9 @@ def test_frame_from_json_bad_data_raises(self, data, msg, orient): @pytest.mark.parametrize("convert_axes", [True, False]) def test_frame_from_json_missing_data(self, orient, convert_axes, dtype): num_df = DataFrame([[1, 
2], [4, 5, 6]]) + data = num_df.to_json(orient=orient) result = read_json( - num_df.to_json(orient=orient), + StringIO(data), orient=orient, convert_axes=convert_axes, dtype=dtype, @@ -331,8 +340,9 @@ def test_frame_from_json_missing_data(self, orient, convert_axes, dtype): assert np.isnan(result.iloc[0, 2]) obj_df = DataFrame([["1", "2"], ["4", "5", "6"]]) + obj_df_data = obj_df.to_json(orient=orient) result = read_json( - obj_df.to_json(orient=orient), + StringIO(obj_df_data), orient=orient, convert_axes=convert_axes, dtype=dtype, @@ -343,7 +353,7 @@ def test_frame_from_json_missing_data(self, orient, convert_axes, dtype): def test_frame_read_json_dtype_missing_value(self, dtype): # GH28501 Parse missing values using read_json with dtype=False # to NaN instead of None - result = read_json("[null]", dtype=dtype) + result = read_json(StringIO("[null]"), dtype=dtype) expected = DataFrame([np.nan]) tm.assert_frame_equal(result, expected) @@ -355,7 +365,8 @@ def test_frame_infinity(self, inf, dtype): # deserialisation df = DataFrame([[1, 2], [4, 5, 6]]) df.loc[0, 2] = inf - result = read_json(df.to_json(), dtype=dtype) + data = df.to_json() + result = read_json(StringIO(data), dtype=dtype) assert np.isnan(result.iloc[0, 2]) @pytest.mark.skipif(not IS64, reason="not compliant on 32-bit, xref #15865") @@ -379,13 +390,14 @@ def test_frame_to_json_except(self): df = DataFrame([1, 2, 3]) msg = "Invalid value 'garbage' for option 'orient'" with pytest.raises(ValueError, match=msg): - df.to_json(orient="garbage") + StringIO(df.to_json(orient="garbage")) def test_frame_empty(self): df = DataFrame(columns=["jim", "joe"]) assert not df._is_mixed_type + data = df.to_json() tm.assert_frame_equal( - read_json(df.to_json(), dtype=dict(df.dtypes)), df, check_index_type=False + read_json(StringIO(data), dtype=dict(df.dtypes)), df, check_index_type=False ) # GH 7445 result = DataFrame({"test": []}, index=[]).to_json(orient="columns") @@ -397,8 +409,9 @@ def 
test_frame_empty_mixedtype(self): df = DataFrame(columns=["jim", "joe"]) df["joe"] = df["joe"].astype("i8") assert df._is_mixed_type + data = df.to_json() tm.assert_frame_equal( - read_json(df.to_json(), dtype=dict(df.dtypes)), df, check_index_type=False + read_json(StringIO(data), dtype=dict(df.dtypes)), df, check_index_type=False ) def test_frame_mixedtype_orient(self): # GH10289 @@ -418,17 +431,17 @@ def test_frame_mixedtype_orient(self): # GH10289 for orient in ["split", "index", "columns"]: inp = df.to_json(orient=orient) - left = read_json(inp, orient=orient, convert_axes=False) + left = read_json(StringIO(inp), orient=orient, convert_axes=False) tm.assert_frame_equal(left, right) right.index = pd.RangeIndex(len(df)) inp = df.to_json(orient="records") - left = read_json(inp, orient="records", convert_axes=False) + left = read_json(StringIO(inp), orient="records", convert_axes=False) tm.assert_frame_equal(left, right) right.columns = pd.RangeIndex(df.shape[1]) inp = df.to_json(orient="values") - left = read_json(inp, orient="values", convert_axes=False) + left = read_json(StringIO(inp), orient="values", convert_axes=False) tm.assert_frame_equal(left, right) def test_v12_compat(self, datapath): @@ -453,13 +466,17 @@ def test_v12_compat(self, datapath): dirpath = datapath("io", "json", "data") v12_json = os.path.join(dirpath, "tsframe_v012.json") - df_unser = read_json(v12_json) - tm.assert_frame_equal(df, df_unser) + + with open(v12_json) as fil: + df_unser = read_json(fil) + tm.assert_frame_equal(df, df_unser) df_iso = df.drop(["modified"], axis=1) v12_iso_json = os.path.join(dirpath, "tsframe_iso_v012.json") - df_unser_iso = read_json(v12_iso_json) - tm.assert_frame_equal(df_iso, df_unser_iso) + + with open(v12_iso_json) as fil: + df_unser_iso = read_json(fil) + tm.assert_frame_equal(df_iso, df_unser_iso) def test_blocks_compat_GH9037(self): index = pd.date_range("20000101", periods=10, freq="H") @@ -546,8 +563,8 @@ def test_blocks_compat_GH9037(self): # JSON 
deserialisation always creates unicode strings df_mixed.columns = df_mixed.columns.astype("unicode") - - df_roundtrip = read_json(df_mixed.to_json(orient="split"), orient="split") + data = df_mixed.to_json(orient="split") + df_roundtrip = read_json(StringIO(data), orient="split") tm.assert_frame_equal( df_mixed, df_roundtrip, @@ -608,12 +625,13 @@ def test_series_non_unique_index(self): with pytest.raises(ValueError, match=msg): s.to_json(orient="index") + data = s.to_json(orient="split") tm.assert_series_equal( - s, read_json(s.to_json(orient="split"), orient="split", typ="series") - ) - unserialized = read_json( - s.to_json(orient="records"), orient="records", typ="series" + s, read_json(StringIO(data), orient="split", typ="series") ) + + data = s.to_json(orient="records") + unserialized = read_json(StringIO(data), orient="records", typ="series") tm.assert_numpy_array_equal(s.values, unserialized.values) def test_series_default_orient(self, string_series): @@ -621,7 +639,7 @@ def test_series_default_orient(self, string_series): def test_series_roundtrip_simple(self, orient, string_series): data = string_series.to_json(orient=orient) - result = read_json(data, typ="series", orient=orient) + result = read_json(StringIO(data), typ="series", orient=orient) expected = string_series if orient in ("values", "records"): @@ -634,7 +652,7 @@ def test_series_roundtrip_simple(self, orient, string_series): @pytest.mark.parametrize("dtype", [False, None]) def test_series_roundtrip_object(self, orient, dtype, object_series): data = object_series.to_json(orient=orient) - result = read_json(data, typ="series", orient=orient, dtype=dtype) + result = read_json(StringIO(data), typ="series", orient=orient, dtype=dtype) expected = object_series if orient in ("values", "records"): @@ -647,7 +665,7 @@ def test_series_roundtrip_object(self, orient, dtype, object_series): def test_series_roundtrip_empty(self, orient): empty_series = Series([], index=[], dtype=np.float64) data = 
empty_series.to_json(orient=orient) - result = read_json(data, typ="series", orient=orient) + result = read_json(StringIO(data), typ="series", orient=orient) expected = empty_series.reset_index(drop=True) if orient in ("split"): @@ -657,7 +675,7 @@ def test_series_roundtrip_empty(self, orient): def test_series_roundtrip_timeseries(self, orient, datetime_series): data = datetime_series.to_json(orient=orient) - result = read_json(data, typ="series", orient=orient) + result = read_json(StringIO(data), typ="series", orient=orient) expected = datetime_series if orient in ("values", "records"): @@ -671,7 +689,7 @@ def test_series_roundtrip_timeseries(self, orient, datetime_series): def test_series_roundtrip_numeric(self, orient, dtype): s = Series(range(6), index=["a", "b", "c", "d", "e", "f"]) data = s.to_json(orient=orient) - result = read_json(data, typ="series", orient=orient) + result = read_json(StringIO(data), typ="series", orient=orient) expected = s.copy() if orient in ("values", "records"): @@ -687,13 +705,15 @@ def test_series_to_json_except(self): def test_series_from_json_precise_float(self): s = Series([4.56, 4.56, 4.56]) - result = read_json(s.to_json(), typ="series", precise_float=True) + data = s.to_json() + result = read_json(StringIO(data), typ="series", precise_float=True) tm.assert_series_equal(result, s, check_index_type=False) def test_series_with_dtype(self): # GH 21986 s = Series([4.56, 4.56, 4.56]) - result = read_json(s.to_json(), typ="series", dtype=np.int64) + data = s.to_json() + result = read_json(StringIO(data), typ="series", dtype=np.int64) expected = Series([4] * 3) tm.assert_series_equal(result, expected) @@ -707,44 +727,49 @@ def test_series_with_dtype(self): def test_series_with_dtype_datetime(self, dtype, expected): s = Series(["2000-01-01"], dtype="datetime64[ns]") data = s.to_json() - result = read_json(data, typ="series", dtype=dtype) + result = read_json(StringIO(data), typ="series", dtype=dtype) tm.assert_series_equal(result, 
expected) def test_frame_from_json_precise_float(self): df = DataFrame([[4.56, 4.56, 4.56], [4.56, 4.56, 4.56]]) - result = read_json(df.to_json(), precise_float=True) + data = df.to_json() + result = read_json(StringIO(data), precise_float=True) tm.assert_frame_equal(result, df) def test_typ(self): s = Series(range(6), index=["a", "b", "c", "d", "e", "f"], dtype="int64") - result = read_json(s.to_json(), typ=None) + data = s.to_json() + result = read_json(StringIO(data), typ=None) tm.assert_series_equal(result, s) def test_reconstruction_index(self): df = DataFrame([[1, 2, 3], [4, 5, 6]]) - result = read_json(df.to_json()) + data = df.to_json() + result = read_json(StringIO(data)) tm.assert_frame_equal(result, df) df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=["A", "B", "C"]) - result = read_json(df.to_json()) + data = df.to_json() + result = read_json(StringIO(data)) tm.assert_frame_equal(result, df) def test_path(self, float_frame, int_frame, datetime_frame): with tm.ensure_clean("test.json") as path: for df in [float_frame, int_frame, datetime_frame]: df.to_json(path) - read_json(path) + with open(path) as fil: + read_json(fil) def test_axis_dates(self, datetime_series, datetime_frame): # frame json = datetime_frame.to_json() - result = read_json(json) + result = read_json(StringIO(json)) tm.assert_frame_equal(result, datetime_frame) # series json = datetime_series.to_json() - result = read_json(json, typ="series") + result = read_json(StringIO(json), typ="series") tm.assert_series_equal(result, datetime_series, check_names=False) assert result.name is None @@ -754,13 +779,13 @@ def test_convert_dates(self, datetime_series, datetime_frame): df["date"] = Timestamp("20130101").as_unit("ns") json = df.to_json() - result = read_json(json) + result = read_json(StringIO(json)) tm.assert_frame_equal(result, df) df["foo"] = 1.0 json = df.to_json(date_unit="ns") - result = read_json(json, convert_dates=False) + result = read_json(StringIO(json), 
convert_dates=False) expected = df.copy() expected["date"] = expected["date"].values.view("i8") expected["foo"] = expected["foo"].astype("int64") @@ -769,7 +794,7 @@ def test_convert_dates(self, datetime_series, datetime_frame): # series ts = Series(Timestamp("20130101").as_unit("ns"), index=datetime_series.index) json = ts.to_json() - result = read_json(json, typ="series") + result = read_json(StringIO(json), typ="series") tm.assert_series_equal(result, ts) @pytest.mark.parametrize("date_format", ["epoch", "iso"]) @@ -815,7 +840,7 @@ def test_convert_dates_infer(self, infer_word): expected = DataFrame( [[1, Timestamp("2002-11-08")], [2, pd.NaT]], columns=["id", infer_word] ) - result = read_json(dumps(data))[["id", infer_word]] + result = read_json(StringIO(dumps(data)))[["id", infer_word]] tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( @@ -838,7 +863,7 @@ def test_date_format_frame(self, date, date_unit, datetime_frame): json = df.to_json(date_format="iso", date_unit=date_unit) else: json = df.to_json(date_format="iso") - result = read_json(json) + result = read_json(StringIO(json)) expected = df.copy() tm.assert_frame_equal(result, expected) @@ -866,7 +891,7 @@ def test_date_format_series(self, date, date_unit, datetime_series): json = ts.to_json(date_format="iso", date_unit=date_unit) else: json = ts.to_json(date_format="iso") - result = read_json(json, typ="series") + result = read_json(StringIO(json), typ="series") expected = ts.copy() tm.assert_series_equal(result, expected) @@ -888,11 +913,11 @@ def test_date_unit(self, unit, datetime_frame): json = df.to_json(date_format="epoch", date_unit=unit) # force date unit - result = read_json(json, date_unit=unit) + result = read_json(StringIO(json), date_unit=unit) tm.assert_frame_equal(result, df) # detect date unit - result = read_json(json, date_unit=None) + result = read_json(StringIO(json), date_unit=None) tm.assert_frame_equal(result, df) def test_weird_nested_json(self): @@ -915,7 +940,7 
@@ def test_weird_nested_json(self): } }""" - read_json(s) + read_json(StringIO(s)) def test_doc_example(self): dfj2 = DataFrame(np.random.randn(5, 2), columns=list("AB")) @@ -925,7 +950,7 @@ def test_doc_example(self): dfj2.index = pd.date_range("20130101", periods=5) json = dfj2.to_json() - result = read_json(json, dtype={"ints": np.int64, "bools": np.bool_}) + result = read_json(StringIO(json), dtype={"ints": np.int64, "bools": np.bool_}) tm.assert_frame_equal(result, result) def test_round_trip_exception_(self, datapath): @@ -933,7 +958,7 @@ def test_round_trip_exception_(self, datapath): path = datapath("io", "json", "data", "teams.csv") df = pd.read_csv(path) s = df.to_json() - result = read_json(s) + result = read_json(StringIO(s)) tm.assert_frame_equal(result.reindex(index=df.index, columns=df.columns), df) @pytest.mark.network @@ -951,7 +976,7 @@ def test_round_trip_exception_(self, datapath): ) def test_url(self, field, dtype): url = "https://api.github.com/repos/pandas-dev/pandas/issues?per_page=5" - result = read_json(url, convert_dates=True) + result = read_json(StringIO(url), convert_dates=True) assert result[field].dtype == dtype def test_timedelta(self): @@ -960,17 +985,22 @@ def test_timedelta(self): ser = Series([timedelta(23), timedelta(seconds=5)]) assert ser.dtype == "timedelta64[ns]" - result = read_json(ser.to_json(), typ="series").apply(converter) + data = ser.to_json() + result = read_json(StringIO(data), typ="series").apply(converter) tm.assert_series_equal(result, ser) ser = Series([timedelta(23), timedelta(seconds=5)], index=pd.Index([0, 1])) assert ser.dtype == "timedelta64[ns]" - result = read_json(ser.to_json(), typ="series").apply(converter) + + data = ser.to_json() + result = read_json(StringIO(data), typ="series").apply(converter) tm.assert_series_equal(result, ser) frame = DataFrame([timedelta(23), timedelta(seconds=5)]) assert frame[0].dtype == "timedelta64[ns]" - tm.assert_frame_equal(frame, 
read_json(frame.to_json()).apply(converter)) + tm.assert_frame_equal( + frame, read_json(StringIO(frame.to_json())).apply(converter) + ) def test_timedelta2(self): frame = DataFrame( @@ -981,7 +1011,7 @@ def test_timedelta2(self): } ) - result = read_json(frame.to_json(date_unit="ns")) + result = read_json(StringIO(frame.to_json(date_unit="ns"))) result["a"] = pd.to_timedelta(result.a, unit="ns") result["c"] = pd.to_datetime(result.c) tm.assert_frame_equal(frame, result) @@ -994,7 +1024,8 @@ def test_mixed_timedelta_datetime(self): expected = DataFrame( {"a": [pd.Timedelta(td).as_unit("ns")._value, ts.as_unit("ns")._value]} ) - result = read_json(frame.to_json(date_unit="ns"), dtype={"a": "int64"}) + data = frame.to_json(date_unit="ns") + result = read_json(StringIO(data), dtype={"a": "int64"}) tm.assert_frame_equal(result, expected, check_index_type=False) @pytest.mark.parametrize("as_object", [True, False]) @@ -1024,7 +1055,8 @@ def test_default_handler(self): value = object() frame = DataFrame({"a": [7, value]}) expected = DataFrame({"a": [7, str(value)]}) - result = read_json(frame.to_json(default_handler=str)) + data = frame.to_json(default_handler=str) + result = read_json(StringIO(data)) tm.assert_frame_equal(expected, result, check_index_type=False) def test_default_handler_indirect(self): @@ -1196,7 +1228,7 @@ def test_tz_range_is_naive(self): def test_read_inline_jsonl(self): # GH9180 - result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True) + result = read_json(StringIO('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n'), lines=True) expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) tm.assert_frame_equal(result, expected) @@ -1206,7 +1238,7 @@ def test_read_s3_jsonl(self, s3_resource, s3so): # GH17200 result = read_json( - "s3n://pandas-test/items.jsonl", lines=True, storage_options=s3so + StringIO("s3n://pandas-test/items.jsonl"), lines=True, storage_options=s3so ) expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) 
tm.assert_frame_equal(result, expected) @@ -1216,9 +1248,11 @@ def test_read_local_jsonl(self): with tm.ensure_clean("tmp_items.json") as path: with open(path, "w", encoding="utf-8") as infile: infile.write('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n') - result = read_json(path, lines=True) - expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) - tm.assert_frame_equal(result, expected) + + with open(path) as fil: + result = read_json(fil, lines=True) + expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) + tm.assert_frame_equal(result, expected) def test_read_jsonl_unicode_chars(self): # GH15132: non-ascii unicode characters @@ -1231,12 +1265,6 @@ def test_read_jsonl_unicode_chars(self): expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"]) tm.assert_frame_equal(result, expected) - # simulate string - json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n' - result = read_json(json, lines=True) - expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"]) - tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 2)]) def test_to_json_large_numbers(self, bigNum): # GH34473 @@ -1287,14 +1315,16 @@ def test_to_jsonl(self): result = df.to_json(orient="records", lines=True) expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}\n' assert result == expected - tm.assert_frame_equal(read_json(result, lines=True), df) + data = read_json(StringIO(result), lines=True) + tm.assert_frame_equal(data, df) # GH15096: escaped characters in columns and data df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]], columns=["a\\", "b"]) result = df.to_json(orient="records", lines=True) expected = '{"a\\\\":"foo\\\\","b":"bar"}\n{"a\\\\":"foo\\"","b":"bar"}\n' assert result == expected - tm.assert_frame_equal(read_json(result, lines=True), df) + data = read_json(StringIO(result), lines=True) + tm.assert_frame_equal(data, df) # TODO: there is a near-identical test for 
pytables; can we share? @pytest.mark.xfail(reason="GH#13774 encoding kwarg not supported", raises=TypeError) @@ -1349,14 +1379,14 @@ def test_from_json_to_json_table_index_and_columns(self, index, columns): # GH25433 GH25435 expected = DataFrame([[1, 2], [3, 4]], index=index, columns=columns) dfjson = expected.to_json(orient="table") - result = read_json(dfjson, orient="table") + result = read_json(StringIO(dfjson), orient="table") tm.assert_frame_equal(result, expected) def test_from_json_to_json_table_dtypes(self): # GH21345 expected = DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["5", "6"]}) dfjson = expected.to_json(orient="table") - result = read_json(dfjson, orient="table") + result = read_json(StringIO(dfjson), orient="table") tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("orient", ["split", "records", "index", "columns"]) @@ -1376,7 +1406,7 @@ def test_to_json_from_json_columns_dtypes(self, orient): ) dfjson = expected.to_json(orient=orient) result = read_json( - dfjson, + StringIO(dfjson), orient=orient, dtype={ "Integer": "int64", @@ -1396,14 +1426,14 @@ def test_read_json_table_dtype_raises(self, dtype): dfjson = df.to_json(orient="table") msg = "cannot pass both dtype and orient='table'" with pytest.raises(ValueError, match=msg): - read_json(dfjson, orient="table", dtype=dtype) + read_json(StringIO(dfjson), orient="table", dtype=dtype) @pytest.mark.parametrize("orient", ["index", "columns", "records", "values"]) def test_read_json_table_empty_axes_dtype(self, orient): # GH28558 expected = DataFrame() - result = read_json("{}", orient=orient, convert_axes=True) + result = read_json(StringIO("{}"), orient=orient, convert_axes=True) tm.assert_index_equal(result.index, expected.index) tm.assert_index_equal(result.columns, expected.columns) @@ -1414,7 +1444,7 @@ def test_read_json_table_convert_axes_raises(self): dfjson = df.to_json(orient="table") msg = "cannot pass both convert_axes and orient='table'" with pytest.raises(ValueError, 
match=msg): - read_json(dfjson, orient="table", convert_axes=True) + read_json(StringIO(dfjson), orient="table", convert_axes=True) @pytest.mark.parametrize( "data, expected", @@ -1517,13 +1547,13 @@ def test_index_false_from_json_to_json(self, orient, index): # Test index=False in from_json to_json expected = DataFrame({"a": [1, 2], "b": [3, 4]}) dfjson = expected.to_json(orient=orient, index=index) - result = read_json(dfjson, orient=orient) + result = read_json(StringIO(dfjson), orient=orient) tm.assert_frame_equal(result, expected) def test_read_timezone_information(self): # GH 25546 result = read_json( - '{"2019-01-01T11:00:00.000Z":88}', typ="series", orient="index" + StringIO('{"2019-01-01T11:00:00.000Z":88}'), typ="series", orient="index" ) expected = Series([88], index=DatetimeIndex(["2019-01-01 11:00:00"], tz="UTC")) tm.assert_series_equal(result, expected) @@ -1539,7 +1569,7 @@ def test_read_timezone_information(self): ) def test_read_json_with_url_value(self, url): # GH 36271 - result = read_json(f'{{"url":{{"0":"{url}"}}}}') + result = read_json(StringIO(f'{{"url":{{"0":"{url}"}}}}')) expected = DataFrame({"url": [url]}) tm.assert_frame_equal(result, expected) @@ -1555,7 +1585,7 @@ def test_read_json_with_very_long_file_path(self, compression): ): # path too long for Windows is handled in file_exists() but raises in # _get_data_from_filepath() - read_json(long_json_path) + read_json(StringIO(long_json_path)) @pytest.mark.parametrize( "date_format,key", [("epoch", 86400000), ("iso", "P1DT0H0M0S")] @@ -1745,7 +1775,7 @@ def test_json_negative_indent_raises(self): def test_emca_262_nan_inf_support(self): # GH 12213 data = '["a", NaN, "NaN", Infinity, "Infinity", -Infinity, "-Infinity"]' - result = read_json(data) + result = read_json(StringIO(data)) expected = DataFrame( ["a", np.nan, "NaN", np.inf, "Infinity", -np.inf, "-Infinity"] ) @@ -1755,7 +1785,7 @@ def test_frame_int_overflow(self): # GH 30320 encoded_json = json.dumps([{"col": 
"31900441201190696999"}, {"col": "Text"}]) expected = DataFrame({"col": ["31900441201190696999", "Text"]}) - result = read_json(encoded_json) + result = read_json(StringIO(encoded_json)) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( @@ -1800,7 +1830,7 @@ def test_json_pandas_nulls(self, nulls_fixture, request): def test_readjson_bool_series(self): # GH31464 - result = read_json("[true, true, false]", typ="series") + result = read_json(StringIO("[true, true, false]"), typ="series") expected = Series([True, True, False]) tm.assert_series_equal(result, expected) @@ -1916,7 +1946,9 @@ def test_read_json_dtype_backend(self, string_storage, dtype_backend, orient): out = df.to_json(orient=orient) with pd.option_context("mode.string_storage", string_storage): - result = read_json(out, dtype_backend=dtype_backend, orient=orient) + result = read_json( + StringIO(out), dtype_backend=dtype_backend, orient=orient + ) expected = DataFrame( { @@ -1955,7 +1987,7 @@ def test_read_json_nullable_series(self, string_storage, dtype_backend, orient): out = ser.to_json(orient=orient) with pd.option_context("mode.string_storage", string_storage): result = read_json( - out, dtype_backend=dtype_backend, orient=orient, typ="series" + StringIO(out), dtype_backend=dtype_backend, orient=orient, typ="series" ) expected = Series([1, np.nan, 3], dtype="Int64") @@ -1973,7 +2005,7 @@ def test_invalid_dtype_backend(self): "'pyarrow' are allowed." 
) with pytest.raises(ValueError, match=msg): - read_json("test", dtype_backend="numpy") + read_json(StringIO("test"), dtype_backend="numpy") def test_invalid_engine(): @@ -1981,7 +2013,7 @@ def test_invalid_engine(): ser = Series(range(1)) out = ser.to_json() with pytest.raises(ValueError, match="The engine type foo"): - read_json(out, engine="foo") + read_json(StringIO(out), engine="foo") def test_pyarrow_engine_lines_false(): @@ -1989,4 +2021,4 @@ def test_pyarrow_engine_lines_false(): ser = Series(range(1)) out = ser.to_json() with pytest.raises(ValueError, match="currently pyarrow engine only supports"): - read_json(out, engine="pyarrow", lines=False) + read_json(StringIO(out), engine="pyarrow", lines=False) diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index c06a6fcc2a037..4a8067e7a1656 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -1,4 +1,8 @@ -from io import StringIO +from io import ( + StringIO, + TextIOWrapper, +) +import os.path from pathlib import Path from typing import Iterator @@ -22,19 +26,22 @@ def lines_json_df(): def test_read_jsonl(): # GH9180 - result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True) + io_inst = StringIO('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n') + result = read_json(io_inst, lines=True) expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) tm.assert_frame_equal(result, expected) def test_read_jsonl_engine_pyarrow(datapath, engine): - result = read_json( - datapath("io", "json", "data", "line_delimited.json"), - lines=True, - engine=engine, - ) - expected = DataFrame({"a": [1, 3, 5], "b": [2, 4, 6]}) - tm.assert_frame_equal(result, expected) + path = datapath("io", "json", "data", "line_delimited.json") + with open(path, "rb") as fil: + result = read_json( + fil, + lines=True, + engine=engine, + ) + expected = DataFrame({"a": [1, 3, 5], "b": [2, 4, 6]}) + tm.assert_frame_equal(result, expected) def 
test_read_datetime(request, engine): @@ -49,7 +56,7 @@ def test_read_datetime(request, engine): columns=["accounts", "date", "name"], ) json_line = df.to_json(lines=True, orient="records") - result = read_json(json_line, engine=engine) + result = read_json(StringIO(json_line), engine=engine) expected = DataFrame( [[1, "2020-03-05", "hector"], [2, "2020-04-08T09:58:49+00:00", "hector"]], columns=["accounts", "date", "name"], @@ -68,12 +75,6 @@ def test_read_jsonl_unicode_chars(): expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"]) tm.assert_frame_equal(result, expected) - # simulate string - json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n' - result = read_json(json, lines=True) - expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"]) - tm.assert_frame_equal(result, expected) - def test_to_jsonl(): # GH9180 @@ -86,14 +87,14 @@ def test_to_jsonl(): result = df.to_json(orient="records", lines=True) expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}\n' assert result == expected - tm.assert_frame_equal(read_json(result, lines=True), df) + tm.assert_frame_equal(read_json(StringIO(result), lines=True), df) # GH15096: escaped characters in columns and data df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]], columns=["a\\", "b"]) result = df.to_json(orient="records", lines=True) expected = '{"a\\\\":"foo\\\\","b":"bar"}\n{"a\\\\":"foo\\"","b":"bar"}\n' assert result == expected - tm.assert_frame_equal(read_json(result, lines=True), df) + tm.assert_frame_equal(read_json(StringIO(result), lines=True), df) def test_to_jsonl_count_new_lines(): @@ -191,38 +192,43 @@ def test_readjson_chunks_from_file(request, engine): with tm.ensure_clean("test.json") as path: df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) df.to_json(path, lines=True, orient="records") - with read_json(path, lines=True, chunksize=1, engine=engine) as reader: - chunked = pd.concat(reader) - unchunked = read_json(path, lines=True, 
engine=engine) - tm.assert_frame_equal(unchunked, chunked) + with open(path) as fil: + df = read_json(fil, lines=True, chunksize=1, engine=engine) + chunked = pd.concat(df) + with open(path) as fil: + unchunked = read_json(fil, lines=True, engine=engine) + tm.assert_frame_equal(unchunked, chunked) @pytest.mark.parametrize("chunksize", [None, 1]) def test_readjson_chunks_closes(chunksize): with tm.ensure_clean("test.json") as path: df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + df.to_json(path, lines=True, orient="records") - reader = JsonReader( - path, - orient=None, - typ="frame", - dtype=True, - convert_axes=True, - convert_dates=True, - keep_default_dates=True, - precise_float=False, - date_unit=None, - encoding=None, - lines=True, - chunksize=chunksize, - compression=None, - nrows=None, - ) - with reader: - reader.read() - assert ( - reader.handles.handle.closed - ), f"didn't close stream with chunksize = {chunksize}" + + with open(path) as fil: + reader = JsonReader( + fil, + orient=None, + typ="frame", + dtype=True, + convert_axes=True, + convert_dates=True, + keep_default_dates=True, + precise_float=False, + date_unit=None, + encoding=None, + lines=True, + chunksize=chunksize, + compression=None, + nrows=None, + ) + with reader: + if not isinstance(reader.handles.handle, TextIOWrapper): + assert ( + reader.handles.handle.closed + ), f"didn't close stream with chunksize = {chunksize}" @pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"]) @@ -255,7 +261,7 @@ def test_readjson_chunks_multiple_empty_lines(chunksize): {"A":3,"B":6} """ orig = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) - test = read_json(j, lines=True, chunksize=chunksize) + test = read_json(StringIO(j), lines=True, chunksize=chunksize) if chunksize is not None: with test: test = pd.concat(test) @@ -276,9 +282,10 @@ def test_readjson_unicode(request, monkeypatch, engine): with open(path, "w", encoding="utf-8") as f: f.write('{"£©µÀÆÖÞßéöÿ":["АБВГДабвгд가"]}') - result = read_json(path, 
engine=engine) - expected = DataFrame({"£©µÀÆÖÞßéöÿ": ["АБВГДабвгд가"]}) - tm.assert_frame_equal(result, expected) + with open(path) as fil: + result = read_json(fil, engine=engine) + expected = DataFrame({"£©µÀÆÖÞßéöÿ": ["АБВГДабвгд가"]}) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("nrows", [1, 2]) @@ -289,7 +296,7 @@ def test_readjson_nrows(nrows, engine): {"a": 3, "b": 4} {"a": 5, "b": 6} {"a": 7, "b": 8}""" - result = read_json(jsonl, lines=True, nrows=nrows) + result = read_json(StringIO(jsonl), lines=True, nrows=nrows) expected = DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows] tm.assert_frame_equal(result, expected) @@ -311,7 +318,7 @@ def test_readjson_nrows_chunks(request, nrows, chunksize, engine): {"a": 5, "b": 6} {"a": 7, "b": 8}""" with read_json( - jsonl, lines=True, nrows=nrows, chunksize=chunksize, engine=engine + StringIO(jsonl), lines=True, nrows=nrows, chunksize=chunksize, engine=engine ) as reader: chunked = pd.concat(reader) expected = DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows] @@ -327,7 +334,7 @@ def test_readjson_nrows_requires_lines(engine): {"a": 7, "b": 8}""" msg = "nrows can only be passed if lines=True" with pytest.raises(ValueError, match=msg): - read_json(jsonl, lines=False, nrows=2, engine=engine) + read_json(StringIO(jsonl), lines=False, nrows=2, engine=engine) def test_readjson_lines_chunks_fileurl(request, datapath, engine): @@ -348,9 +355,13 @@ def test_readjson_lines_chunks_fileurl(request, datapath, engine): ] os_path = datapath("io", "json", "data", "line_delimited.json") file_url = Path(os_path).as_uri() - with read_json(file_url, lines=True, chunksize=1, engine=engine) as url_reader: - for index, chuck in enumerate(url_reader): - tm.assert_frame_equal(chuck, df_list_expected[index]) + + if os.path.exists(file_url): + with read_json( + StringIO(file_url), lines=True, chunksize=1, engine=engine + ) as url_reader: + for index, chuck in enumerate(url_reader): + 
tm.assert_frame_equal(chuck, df_list_expected[index]) def test_chunksize_is_incremental(): @@ -433,8 +444,9 @@ def test_to_json_append_output_consistent_columns(): df2.to_json(path, mode="a", lines=True, orient="records") # Read path file - result = read_json(path, lines=True) - tm.assert_frame_equal(result, expected) + with open(path) as fil: + result = read_json(fil, lines=True) + tm.assert_frame_equal(result, expected) def test_to_json_append_output_inconsistent_columns(): @@ -457,8 +469,9 @@ def test_to_json_append_output_inconsistent_columns(): df3.to_json(path, mode="a", lines=True, orient="records") # Read path file - result = read_json(path, lines=True) - tm.assert_frame_equal(result, expected) + with open(path) as fil: + result = read_json(fil, lines=True) + tm.assert_frame_equal(result, expected) def test_to_json_append_output_different_columns(): @@ -486,8 +499,9 @@ def test_to_json_append_output_different_columns(): df4.to_json(path, mode="a", lines=True, orient="records") # Read path file - result = read_json(path, lines=True) - tm.assert_frame_equal(result, expected) + with open(path) as fil: + result = read_json(fil, lines=True) + tm.assert_frame_equal(result, expected) def test_to_json_append_output_different_columns_reordered(): @@ -516,5 +530,6 @@ def test_to_json_append_output_different_columns_reordered(): df1.to_json(path, mode="a", lines=True, orient="records") # Read path file - result = read_json(path, lines=True) - tm.assert_frame_equal(result, expected) + with open(path) as fil: + result = read_json(fil, lines=True) + tm.assert_frame_equal(result, expected)