diff --git a/pandas/tests/io/json/conftest.py b/pandas/tests/io/json/conftest.py
new file mode 100644
index 0000000000000..4e848cd48b42d
--- /dev/null
+++ b/pandas/tests/io/json/conftest.py
@@ -0,0 +1,9 @@
+import pytest
+
+
+@pytest.fixture(params=["split", "records", "index", "columns", "values"])
+def orient(request):
+    """
+    Fixture for orients excluding the table format.
+    """
+    return request.param
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
index 9842a706f43d7..2ef8244f488ae 100644
--- a/pandas/tests/io/json/test_pandas.py
+++ b/pandas/tests/io/json/test_pandas.py
@@ -7,16 +7,14 @@
 import numpy as np
 import pytest
 
-from pandas.compat import is_platform_32bit
+from pandas.compat import PY35, is_platform_32bit
 import pandas.util._test_decorators as td
 
 import pandas as pd
 from pandas import DataFrame, DatetimeIndex, Series, Timestamp, read_json
 import pandas.util.testing as tm
 from pandas.util.testing import (
-    assert_almost_equal,
     assert_frame_equal,
-    assert_index_equal,
     assert_series_equal,
     ensure_clean,
     network,
@@ -82,76 +80,79 @@ def setup(self, datapath):
         del self.tsframe
         del self.mixed_frame
 
-    def test_frame_double_encoded_labels(self):
+    def test_frame_double_encoded_labels(self, orient):
         df = DataFrame(
             [["a", "b"], ["c", "d"]],
             index=['index " 1', "index / 2"],
             columns=["a \\ b", "y / z"],
         )
-        assert_frame_equal(df, read_json(df.to_json(orient="split"), orient="split"))
-        assert_frame_equal(
-            df, read_json(df.to_json(orient="columns"), orient="columns")
-        )
-        assert_frame_equal(df, read_json(df.to_json(orient="index"), orient="index"))
-        df_unser = read_json(df.to_json(orient="records"), orient="records")
-        assert_index_equal(df.columns, df_unser.columns)
-        tm.assert_numpy_array_equal(df.values, df_unser.values)
-
-    def test_frame_non_unique_index(self):
-        df = DataFrame([["a", "b"], ["c", "d"]], index=[1, 1], columns=["x", "y"])
-
-        msg = "DataFrame index must be unique for orient='index'"
-        with pytest.raises(ValueError, match=msg):
-            df.to_json(orient="index")
-        msg = "DataFrame index must be unique for orient='columns'"
-        with pytest.raises(ValueError, match=msg):
-            df.to_json(orient="columns")
+        result = read_json(df.to_json(orient=orient), orient=orient)
+        expected = df.copy()
 
-        assert_frame_equal(df, read_json(df.to_json(orient="split"), orient="split"))
-        unser = read_json(df.to_json(orient="records"), orient="records")
-        tm.assert_index_equal(df.columns, unser.columns)
-        tm.assert_almost_equal(df.values, unser.values)
-        unser = read_json(df.to_json(orient="values"), orient="values")
-        tm.assert_numpy_array_equal(df.values, unser.values)
+        if orient == "records" or orient == "values":
+            expected = expected.reset_index(drop=True)
+        if orient == "values":
+            expected.columns = range(len(expected.columns))
 
-    def test_frame_non_unique_columns(self):
-        df = DataFrame([["a", "b"], ["c", "d"]], index=[1, 2], columns=["x", "x"])
+        assert_frame_equal(result, expected)
 
-        msg = "DataFrame columns must be unique for orient='index'"
-        with pytest.raises(ValueError, match=msg):
-            df.to_json(orient="index")
-        msg = "DataFrame columns must be unique for orient='columns'"
-        with pytest.raises(ValueError, match=msg):
-            df.to_json(orient="columns")
-        msg = "DataFrame columns must be unique for orient='records'"
-        with pytest.raises(ValueError, match=msg):
-            df.to_json(orient="records")
+    @pytest.mark.parametrize("orient", ["split", "records", "values"])
+    def test_frame_non_unique_index(self, orient):
+        df = DataFrame([["a", "b"], ["c", "d"]], index=[1, 1], columns=["x", "y"])
+        result = read_json(df.to_json(orient=orient), orient=orient)
+        expected = df.copy()
 
-        assert_frame_equal(
-            df, read_json(df.to_json(orient="split"), orient="split", dtype=False)
-        )
-        unser = read_json(df.to_json(orient="values"), orient="values")
-        tm.assert_numpy_array_equal(df.values, unser.values)
+        if orient == "records" or orient == "values":
+            expected = expected.reset_index(drop=True)
+        if orient == "values":
+            expected.columns = range(len(expected.columns))
 
-        # GH4377; duplicate columns not processing correctly
-        df = DataFrame([["a", "b"], ["c", "d"]], index=[1, 2], columns=["x", "y"])
-        result = read_json(df.to_json(orient="split"), orient="split")
-        assert_frame_equal(result, df)
+        assert_frame_equal(result, expected)
 
-        def _check(df):
-            result = read_json(
-                df.to_json(orient="split"), orient="split", convert_dates=["x"]
-            )
-            assert_frame_equal(result, df)
+    @pytest.mark.parametrize("orient", ["index", "columns"])
+    def test_frame_non_unique_index_raises(self, orient):
+        df = DataFrame([["a", "b"], ["c", "d"]], index=[1, 1], columns=["x", "y"])
+        msg = "DataFrame index must be unique for orient='{}'".format(orient)
+        with pytest.raises(ValueError, match=msg):
+            df.to_json(orient=orient)
 
-        for o in [
+    @pytest.mark.parametrize("orient", ["split", "values"])
+    @pytest.mark.parametrize(
+        "data",
+        [
             [["a", "b"], ["c", "d"]],
             [[1.5, 2.5], [3.5, 4.5]],
             [[1, 2.5], [3, 4.5]],
             [[Timestamp("20130101"), 3.5], [Timestamp("20130102"), 4.5]],
-        ]:
-            _check(DataFrame(o, index=[1, 2], columns=["x", "x"]))
+        ],
+    )
+    def test_frame_non_unique_columns(self, orient, data):
+        df = DataFrame(data, index=[1, 2], columns=["x", "x"])
+
+        result = read_json(
+            df.to_json(orient=orient), orient=orient, convert_dates=["x"]
+        )
+        if orient == "values":
+            expected = pd.DataFrame(data)
+            if expected.iloc[:, 0].dtype == "datetime64[ns]":
+                # orient == "values" by default will write Timestamp objects out
+                # in milliseconds; these are internally stored in nanosecond,
+                # so divide to get where we need
+                # TODO: a to_epoch method would also solve; see GH 14772
+                expected.iloc[:, 0] = expected.iloc[:, 0].astype(np.int64) // 1000000
+        elif orient == "split":
+            expected = df
+
+        assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("orient", ["index", "columns", "records"])
+    def test_frame_non_unique_columns_raises(self, orient):
+        df = DataFrame([["a", "b"], ["c", "d"]], index=[1, 2], columns=["x", "x"])
+
+        msg = "DataFrame columns must be unique for orient='{}'".format(orient)
+        with pytest.raises(ValueError, match=msg):
+            df.to_json(orient=orient)
 
     def test_frame_from_json_to_json(self):
         def _check_orient(
@@ -470,101 +471,93 @@ def _check_all_orients(
         # force everything to have object dtype beforehand
         _check_orient(df.transpose().transpose(), "index", dtype=False)
 
-    def test_frame_from_json_bad_data(self):
-        with pytest.raises(ValueError, match="Expected object or value"):
-            read_json(StringIO('{"key":b:a:d}'))
-
-        # too few indices
-        json = StringIO(
-            '{"columns":["A","B"],'
-            '"index":["2","3"],'
-            '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}'
-        )
-        msg = r"Shape of passed values is \(3, 2\), indices imply \(2, 2\)"
+    @pytest.mark.parametrize(
+        "data,msg,orient",
+        [
+            ('{"key":b:a:d}', "Expected object or value", "columns"),
+            # too few indices
+            (
+                '{"columns":["A","B"],'
+                '"index":["2","3"],'
+                '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}',
+                r"Shape of passed values is \(3, 2\), indices imply \(2, 2\)",
+                "split",
+            ),
+            # too many columns
+            (
+                '{"columns":["A","B","C"],'
+                '"index":["1","2","3"],'
+                '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}',
+                "3 columns passed, passed data had 2 columns",
+                "split",
+            ),
+            # bad key
+            (
+                '{"badkey":["A","B"],'
+                '"index":["2","3"],'
+                '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}',
+                r"unexpected key\(s\): badkey",
+                "split",
+            ),
+        ],
+    )
+    def test_frame_from_json_bad_data_raises(self, data, msg, orient):
         with pytest.raises(ValueError, match=msg):
-            read_json(json, orient="split")
+            read_json(StringIO(data), orient=orient)
 
-        # too many columns
-        json = StringIO(
-            '{"columns":["A","B","C"],'
-            '"index":["1","2","3"],'
-            '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}'
+    @pytest.mark.parametrize("dtype", [True, False])
+    @pytest.mark.parametrize("convert_axes", [True, False])
+    @pytest.mark.parametrize("numpy", [True, False])
+    def test_frame_from_json_missing_data(self, orient, convert_axes, numpy, dtype):
+        num_df = DataFrame([[1, 2], [4, 5, 6]])
+        result = read_json(
+            num_df.to_json(orient=orient),
+            orient=orient,
+            convert_axes=convert_axes,
+            dtype=dtype,
         )
-        msg = "3 columns passed, passed data had 2 columns"
-        with pytest.raises(ValueError, match=msg):
-            read_json(json, orient="split")
+        assert np.isnan(result.iloc[0, 2])
 
-        # bad key
-        json = StringIO(
-            '{"badkey":["A","B"],'
-            '"index":["2","3"],'
-            '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}'
+        obj_df = DataFrame([["1", "2"], ["4", "5", "6"]])
+        result = read_json(
+            obj_df.to_json(orient=orient),
+            orient=orient,
+            convert_axes=convert_axes,
+            dtype=dtype,
         )
-        with pytest.raises(ValueError, match=r"unexpected key\(s\): badkey"):
-            read_json(json, orient="split")
-
-    def test_frame_from_json_nones(self):
-        df = DataFrame([[1, 2], [4, 5, 6]])
-        unser = read_json(df.to_json())
-        assert np.isnan(unser[2][0])
-
-        df = DataFrame([["1", "2"], ["4", "5", "6"]])
-        unser = read_json(df.to_json())
-        assert np.isnan(unser[2][0])
-        unser = read_json(df.to_json(), dtype=False)
-        assert unser[2][0] is None
-        unser = read_json(df.to_json(), convert_axes=False, dtype=False)
-        assert unser["2"]["0"] is None
-
-        unser = read_json(df.to_json(), numpy=False)
-        assert np.isnan(unser[2][0])
-        unser = read_json(df.to_json(), numpy=False, dtype=False)
-        assert unser[2][0] is None
-        unser = read_json(df.to_json(), numpy=False, convert_axes=False, dtype=False)
-        assert unser["2"]["0"] is None
+        if not dtype:  # TODO: Special case for object data; maybe a bug?
+            assert result.iloc[0, 2] is None
+        else:
+            assert np.isnan(result.iloc[0, 2])
 
+    @pytest.mark.parametrize("inf", [np.inf, np.NINF])
+    @pytest.mark.parametrize("dtype", [True, False])
+    def test_frame_infinity(self, orient, inf, dtype):
         # infinities get mapped to nulls which get mapped to NaNs during
         # deserialisation
         df = DataFrame([[1, 2], [4, 5, 6]])
-        df.loc[0, 2] = np.inf
-        unser = read_json(df.to_json())
-        assert np.isnan(unser[2][0])
-        unser = read_json(df.to_json(), dtype=False)
-        assert np.isnan(unser[2][0])
-
-        df.loc[0, 2] = np.NINF
-        unser = read_json(df.to_json())
-        assert np.isnan(unser[2][0])
-        unser = read_json(df.to_json(), dtype=False)
-        assert np.isnan(unser[2][0])
+        df.loc[0, 2] = inf
+        result = read_json(df.to_json(), dtype=dtype)
+        assert np.isnan(result.iloc[0, 2])
 
     @pytest.mark.skipif(
         is_platform_32bit(), reason="not compliant on 32-bit, xref #15865"
     )
-    def test_frame_to_json_float_precision(self):
-        df = pd.DataFrame([dict(a_float=0.95)])
-        encoded = df.to_json(double_precision=1)
-        assert encoded == '{"a_float":{"0":1.0}}'
-
-        df = pd.DataFrame([dict(a_float=1.95)])
-        encoded = df.to_json(double_precision=1)
-        assert encoded == '{"a_float":{"0":2.0}}'
-
-        df = pd.DataFrame([dict(a_float=-1.95)])
-        encoded = df.to_json(double_precision=1)
-        assert encoded == '{"a_float":{"0":-2.0}}'
-
-        df = pd.DataFrame([dict(a_float=0.995)])
-        encoded = df.to_json(double_precision=2)
-        assert encoded == '{"a_float":{"0":1.0}}'
-
-        df = pd.DataFrame([dict(a_float=0.9995)])
-        encoded = df.to_json(double_precision=3)
-        assert encoded == '{"a_float":{"0":1.0}}'
-
-        df = pd.DataFrame([dict(a_float=0.99999999999999944)])
-        encoded = df.to_json(double_precision=15)
-        assert encoded == '{"a_float":{"0":1.0}}'
+    @pytest.mark.parametrize(
+        "value,precision,expected_val",
+        [
+            (0.95, 1, 1.0),
+            (1.95, 1, 2.0),
+            (-1.95, 1, -2.0),
+            (0.995, 2, 1.0),
+            (0.9995, 3, 1.0),
+            (0.99999999999999944, 15, 1.0),
+        ],
+    )
+    def test_frame_to_json_float_precision(self, value, precision, expected_val):
+        df = pd.DataFrame([dict(a_float=value)])
+        encoded = df.to_json(double_precision=precision)
+        assert encoded == '{{"a_float":{{"0":{}}}}}'.format(expected_val)
 
     def test_frame_to_json_except(self):
         df = DataFrame([1, 2, 3])
@@ -799,107 +792,92 @@ def test_series_non_unique_index(self):
         unser = read_json(s.to_json(orient="records"), orient="records", typ="series")
         tm.assert_numpy_array_equal(s.values, unser.values)
 
-    def test_series_from_json_to_json(self):
-        def _check_orient(
-            series, orient, dtype=None, numpy=False, check_index_type=True
-        ):
-            series = series.sort_index()
-            unser = read_json(
-                series.to_json(orient=orient),
-                typ="series",
-                orient=orient,
-                numpy=numpy,
-                dtype=dtype,
-            )
-            unser = unser.sort_index()
-            if orient == "records" or orient == "values":
-                assert_almost_equal(series.values, unser.values)
-            else:
-                if orient == "split":
-                    assert_series_equal(
-                        series, unser, check_index_type=check_index_type
-                    )
-                else:
-                    assert_series_equal(
-                        series,
-                        unser,
-                        check_names=False,
-                        check_index_type=check_index_type,
-                    )
-
-        def _check_all_orients(series, dtype=None, check_index_type=True):
-            _check_orient(
-                series, "columns", dtype=dtype, check_index_type=check_index_type
-            )
-            _check_orient(
-                series, "records", dtype=dtype, check_index_type=check_index_type
-            )
-            _check_orient(
-                series, "split", dtype=dtype, check_index_type=check_index_type
-            )
-            _check_orient(
-                series, "index", dtype=dtype, check_index_type=check_index_type
-            )
-            _check_orient(series, "values", dtype=dtype)
-
-            _check_orient(
-                series,
-                "columns",
-                dtype=dtype,
-                numpy=True,
-                check_index_type=check_index_type,
-            )
-            _check_orient(
-                series,
-                "records",
-                dtype=dtype,
-                numpy=True,
-                check_index_type=check_index_type,
-            )
-            _check_orient(
-                series,
-                "split",
-                dtype=dtype,
-                numpy=True,
-                check_index_type=check_index_type,
-            )
-            _check_orient(
-                series,
-                "index",
-                dtype=dtype,
-                numpy=True,
-                check_index_type=check_index_type,
-            )
-            _check_orient(
-                series,
-                "values",
-                dtype=dtype,
-                numpy=True,
-                check_index_type=check_index_type,
-            )
-
-        # basic
-        _check_all_orients(self.series)
+    def test_series_default_orient(self):
         assert self.series.to_json() == self.series.to_json(orient="index")
 
-        objSeries = Series(
+    @pytest.mark.parametrize("numpy", [True, False])
+    def test_series_roundtrip_simple(self, orient, numpy):
+        data = self.series.to_json(orient=orient)
+        result = pd.read_json(data, typ="series", orient=orient, numpy=numpy)
+        expected = self.series.copy()
+
+        if not numpy and PY35 and orient in ("index", "columns"):
+            expected = expected.sort_index()
+        if orient in ("values", "records"):
+            expected = expected.reset_index(drop=True)
+        if orient != "split":
+            expected.name = None
+
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("dtype", [False, None])
+    @pytest.mark.parametrize("numpy", [True, False])
+    def test_series_roundtrip_object(self, orient, numpy, dtype):
+        # TODO: see why tm.makeObjectSeries provides back DTA
+        dtSeries = Series(
            [str(d) for d in self.objSeries],
            index=self.objSeries.index,
            name=self.objSeries.name,
        )
-        _check_all_orients(objSeries, dtype=False)
+        data = dtSeries.to_json(orient=orient)
+        result = pd.read_json(
+            data, typ="series", orient=orient, numpy=numpy, dtype=dtype
+        )
+        if dtype is False:
+            expected = dtSeries.copy()
+        else:
+            expected = self.objSeries.copy()
+
+        if not numpy and PY35 and orient in ("index", "columns"):
+            expected = expected.sort_index()
+        if orient in ("values", "records"):
+            expected = expected.reset_index(drop=True)
+        if orient != "split":
+            expected.name = None
+
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("numpy", [True, False])
+    def test_series_roundtrip_empty(self, orient, numpy):
+        data = self.empty_series.to_json(orient=orient)
+        result = pd.read_json(data, typ="series", orient=orient, numpy=numpy)
+        expected = self.empty_series.copy()
+
+        # TODO: see what causes inconsistency
+        if not numpy and PY35 and orient == "index":
+            expected = expected.sort_index()
+        if orient in ("values", "records"):
+            expected = expected.reset_index(drop=True)
+        else:
+            expected.index = expected.index.astype(float)
+
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("numpy", [True, False])
+    def test_series_roundtrip_timeseries(self, orient, numpy):
+        data = self.ts.to_json(orient=orient)
+        result = pd.read_json(data, typ="series", orient=orient, numpy=numpy)
+        expected = self.ts.copy()
 
-        # empty_series has empty index with object dtype
-        # which cannot be revert
-        assert self.empty_series.index.dtype == np.object_
-        _check_all_orients(self.empty_series, check_index_type=False)
+        if orient in ("values", "records"):
+            expected = expected.reset_index(drop=True)
+        if orient != "split":
+            expected.name = None
 
-        _check_all_orients(self.ts)
+        tm.assert_series_equal(result, expected)
 
-        # dtype
+    @pytest.mark.parametrize("dtype", [np.float64, np.int])
+    @pytest.mark.parametrize("numpy", [True, False])
+    def test_series_roundtrip_numeric(self, orient, numpy, dtype):
         s = Series(range(6), index=["a", "b", "c", "d", "e", "f"])
-        _check_all_orients(Series(s, dtype=np.float64), dtype=np.float64)
-        _check_all_orients(Series(s, dtype=np.int), dtype=np.int)
+        data = s.to_json(orient=orient)
+        result = pd.read_json(data, typ="series", orient=orient, numpy=numpy)
+
+        expected = s.copy()
+        if orient in ("values", "records"):
+            expected = expected.reset_index(drop=True)
+
+        tm.assert_series_equal(result, expected)
 
     def test_series_to_json_except(self):
         s = Series([1, 2, 3])
@@ -991,11 +969,9 @@ def test_convert_dates(self):
         result = read_json(json, typ="series")
         assert_series_equal(result, ts)
 
-    def test_convert_dates_infer(self):
-        # GH10747
-        from pandas.io.json import dumps
-
-        infer_words = [
+    @pytest.mark.parametrize(
+        "infer_word",
+        [
             "trade_time",
             "date",
             "datetime",
@@ -1003,14 +979,18 @@
             "modified",
             "timestamp",
             "timestamps",
-        ]
-        for infer_word in infer_words:
-            data = [{"id": 1, infer_word: 1036713600000}, {"id": 2}]
-            expected = DataFrame(
-                [[1, Timestamp("2002-11-08")], [2, pd.NaT]], columns=["id", infer_word]
-            )
-            result = read_json(dumps(data))[["id", infer_word]]
-            assert_frame_equal(result, expected)
+        ],
+    )
+    def test_convert_dates_infer(self, infer_word):
+        # GH10747
+        from pandas.io.json import dumps
+
+        data = [{"id": 1, infer_word: 1036713600000}, {"id": 2}]
+        expected = DataFrame(
+            [[1, Timestamp("2002-11-08")], [2, pd.NaT]], columns=["id", infer_word]
+        )
+        result = read_json(dumps(data))[["id", infer_word]]
+        assert_frame_equal(result, expected)
 
     @pytest.mark.parametrize(
         "date,date_unit",
@@ -1315,27 +1295,32 @@ def test_sparse(self):
         expected = s.to_json()
         assert expected == ss.to_json()
 
-    def test_tz_is_utc(self):
+    @pytest.mark.parametrize(
+        "ts",
+        [
+            Timestamp("2013-01-10 05:00:00Z"),
+            Timestamp("2013-01-10 00:00:00", tz="US/Eastern"),
+            Timestamp("2013-01-10 00:00:00-0500"),
+        ],
+    )
+    def test_tz_is_utc(self, ts):
         from pandas.io.json import dumps
 
         exp = '"2013-01-10T05:00:00.000Z"'
 
-        ts = Timestamp("2013-01-10 05:00:00Z")
-        assert dumps(ts, iso_dates=True) == exp
-        dt = ts.to_pydatetime()
-        assert dumps(dt, iso_dates=True) == exp
-
-        ts = Timestamp("2013-01-10 00:00:00", tz="US/Eastern")
         assert dumps(ts, iso_dates=True) == exp
         dt = ts.to_pydatetime()
         assert dumps(dt, iso_dates=True) == exp
 
-        ts = Timestamp("2013-01-10 00:00:00-0500")
-        assert dumps(ts, iso_dates=True) == exp
-        dt = ts.to_pydatetime()
-        assert dumps(dt, iso_dates=True) == exp
-
-    def test_tz_range_is_utc(self):
+    @pytest.mark.parametrize(
+        "tz_range",
+        [
+            pd.date_range("2013-01-01 05:00:00Z", periods=2),
+            pd.date_range("2013-01-01 00:00:00", periods=2, tz="US/Eastern"),
+            pd.date_range("2013-01-01 00:00:00-0500", periods=2),
+        ],
+    )
+    def test_tz_range_is_utc(self, tz_range):
         from pandas.io.json import dumps
 
         exp = '["2013-01-01T05:00:00.000Z","2013-01-02T05:00:00.000Z"]'
@@ -1345,7 +1330,6 @@ def test_tz_range_is_utc(self):
             '"1":"2013-01-02T05:00:00.000Z"}}'
         )
 
-        tz_range = pd.date_range("2013-01-01 05:00:00Z", periods=2)
         assert dumps(tz_range, iso_dates=True) == exp
         dti = pd.DatetimeIndex(tz_range)
         assert dumps(dti, iso_dates=True) == exp
@@ -1353,20 +1337,6 @@
         result = dumps(df, iso_dates=True)
         assert result == dfexp
 
-        tz_range = pd.date_range("2013-01-01 00:00:00", periods=2, tz="US/Eastern")
-        assert dumps(tz_range, iso_dates=True) == exp
-        dti = pd.DatetimeIndex(tz_range)
-        assert dumps(dti, iso_dates=True) == exp
-        df = DataFrame({"DT": dti})
-        assert dumps(df, iso_dates=True) == dfexp
-
-        tz_range = pd.date_range("2013-01-01 00:00:00-0500", periods=2)
-        assert dumps(tz_range, iso_dates=True) == exp
-        dti = pd.DatetimeIndex(tz_range)
-        assert dumps(dti, iso_dates=True) == exp
-        df = DataFrame({"DT": dti})
-        assert dumps(df, iso_dates=True) == dfexp
-
     def test_read_inline_jsonl(self):
         # GH9180
         result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True)
@@ -1458,14 +1428,10 @@ def test_latin_encoding(self):
             [b"A\xf8\xfc", np.nan, b"", b"b", b"c"],
         ]
 
-        def _try_decode(x, encoding="latin-1"):
-            try:
-                return x.decode(encoding)
-            except AttributeError:
-                return x
-
-        # not sure how to remove latin-1 from code in python 2 and 3
-        values = [[_try_decode(x) for x in y] for y in values]
+        values = [
+            [x.decode("latin-1") if isinstance(x, bytes) else x for x in y]
+            for y in values
+        ]
 
         examples = []
        for dtype in ["category", object]: