Add Indent Support in to_json (#28130)

WillAyd · web-flow · commit c94eaeec0387 · 2019-09-18T08:10:06.000-07:00
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
@@ -36,7 +36,7 @@ Other enhancements
   when using the ``pyarrow`` engine. It is currently not yet supported when converting back to
   pandas (so it will become an integer or float dtype depending on the presence of missing data).
   (:issue:`28368`)
--
+- :meth:`DataFrame.to_json` now accepts an ``indent`` integer argument to enable pretty printing of JSON output (:issue:`12004`)
 
 
 Build Changes
@@ -217,6 +217,7 @@ I/O
 - Improve infinity parsing. :meth:`read_csv` now interprets ``Infinity``, ``+Infinity``, ``-Infinity`` as floating point values (:issue:`10065`)
 - Bug in :meth:`DataFrame.to_csv` where values were truncated when the length of ``na_rep`` was shorter than the text input data. (:issue:`25099`)
 - Bug in :func:`DataFrame.to_string` where values were truncated using display options instead of outputting the full content (:issue:`9784`)
+- Bug in :meth:`DataFrame.to_json` where a datetime column label would not be written out in ISO format with ``orient="table"`` (:issue:`28130`)
 
 Plotting
 ^^^^^^^^
diff --git a/pandas/_libs/src/ujson/lib/ultrajson.h b/pandas/_libs/src/ujson/lib/ultrajson.h
@@ -244,6 +244,10 @@ typedef struct __JSONObjectEncoder {
   If true, '<', '>', and '&' characters will be encoded as \u003c, \u003e, and \u0026, respectively. If false, no special encoding will be used. */
   int encodeHTMLChars;
 
+  /*
+  Number of spaces written per indentation level; 0 disables pretty printing */
+  int indent;
+
   /*
   Set to an error message if error occurred */
   const char *errorMsg;
diff --git a/pandas/_libs/src/ujson/lib/ultrajsonenc.c b/pandas/_libs/src/ujson/lib/ultrajsonenc.c
@@ -728,6 +728,22 @@ FASTCALL_ATTR INLINE_PREFIX void FASTCALL_MSVC strreverse(char *begin,
     while (end > begin) aux = *end, *end-- = *begin, *begin++ = aux;
 }
 
+// Appends a single '\n' to the encoder output when pretty printing is
+// enabled (enc->indent > 0); writes nothing otherwise.
+// NOTE(review): goes through the Unchecked append helper, so presumably the
+// caller must have ensured buffer capacity beforehand -- confirm.
+void Buffer_AppendIndentNewlineUnchecked(JSONObjectEncoder *enc)
+{
+  if (enc->indent > 0) Buffer_AppendCharUnchecked(enc, '\n');
+}
+
+// Appends the indentation for `value` nesting levels: when enc->indent > 0
+// and value is positive, writes value * enc->indent space characters;
+// otherwise writes nothing. Call sites in this change pass enc->level.
+// NOTE(review): goes through the Unchecked append helper, so presumably the
+// caller must have ensured buffer capacity beforehand -- confirm, since the
+// Buffer_Reserve(enc, 2) calls before the closing ']' / '}' are removed here.
+//
+// This function could be refactored to only accept enc as an argument,
+// but this is a straight vendor from ujson source
+void Buffer_AppendIndentUnchecked(JSONObjectEncoder *enc, JSINT32 value)
+{
+  int i;
+  if (enc->indent > 0)
+    while (value-- > 0)
+      for (i = 0; i < enc->indent; i++)
+        Buffer_AppendCharUnchecked(enc, ' ');
+}
+
 void Buffer_AppendIntUnchecked(JSONObjectEncoder *enc, JSINT32 value) {
     char *wstr;
     JSUINT32 uvalue = (value < 0) ? -value : value;
@@ -960,24 +976,28 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name,
             enc->iterBegin(obj, &tc);
 
             Buffer_AppendCharUnchecked(enc, '[');
+            Buffer_AppendIndentNewlineUnchecked (enc);
 
             while (enc->iterNext(obj, &tc)) {
                 if (count > 0) {
                     Buffer_AppendCharUnchecked(enc, ',');
 #ifndef JSON_NO_EXTRA_WHITESPACE
                     Buffer_AppendCharUnchecked(buffer, ' ');
 #endif
+                    Buffer_AppendIndentNewlineUnchecked (enc);
                 }
 
                 iterObj = enc->iterGetValue(obj, &tc);
 
                 enc->level++;
+                Buffer_AppendIndentUnchecked (enc, enc->level);
                 encode(iterObj, enc, NULL, 0);
                 count++;
             }
 
             enc->iterEnd(obj, &tc);
-            Buffer_Reserve(enc, 2);
+            Buffer_AppendIndentNewlineUnchecked (enc);
+            Buffer_AppendIndentUnchecked (enc, enc->level);
             Buffer_AppendCharUnchecked(enc, ']');
             break;
         }
@@ -987,25 +1007,29 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name,
             enc->iterBegin(obj, &tc);
 
             Buffer_AppendCharUnchecked(enc, '{');
+            Buffer_AppendIndentNewlineUnchecked (enc);
 
             while (enc->iterNext(obj, &tc)) {
                 if (count > 0) {
                     Buffer_AppendCharUnchecked(enc, ',');
 #ifndef JSON_NO_EXTRA_WHITESPACE
                     Buffer_AppendCharUnchecked(enc, ' ');
 #endif
+                    Buffer_AppendIndentNewlineUnchecked (enc);
                 }
 
                 iterObj = enc->iterGetValue(obj, &tc);
                 objName = enc->iterGetName(obj, &tc, &szlen);
 
                 enc->level++;
+                Buffer_AppendIndentUnchecked (enc, enc->level);
                 encode(iterObj, enc, objName, szlen);
                 count++;
             }
 
             enc->iterEnd(obj, &tc);
-            Buffer_Reserve(enc, 2);
+            Buffer_AppendIndentNewlineUnchecked (enc);
+            Buffer_AppendIndentUnchecked (enc, enc->level);
             Buffer_AppendCharUnchecked(enc, '}');
             break;
         }
diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c
@@ -2373,10 +2373,16 @@ char *Object_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) {
 }
 
 PyObject *objToJSON(PyObject *self, PyObject *args, PyObject *kwargs) {
-    static char *kwlist[] = {
-        "obj",    "ensure_ascii", "double_precision", "encode_html_chars",
-        "orient", "date_unit",    "iso_dates",        "default_handler",
-        NULL};
+    static char *kwlist[] = {"obj",
+                             "ensure_ascii",
+                             "double_precision",
+                             "encode_html_chars",
+                             "orient",
+                             "date_unit",
+                             "iso_dates",
+                             "default_handler",
+                             "indent",
+                             NULL};
 
     char buffer[65536];
     char *ret;
@@ -2389,6 +2395,7 @@ PyObject *objToJSON(PyObject *self, PyObject *args, PyObject *kwargs) {
     char *sdateFormat = NULL;
     PyObject *oisoDates = 0;
     PyObject *odefHandler = 0;
+    int indent = 0;
 
     PyObjectEncoder pyEncoder = {{
         Object_beginTypeContext,
@@ -2410,6 +2417,7 @@ PyObject *objToJSON(PyObject *self, PyObject *args, PyObject *kwargs) {
         idoublePrecision,
         1, // forceAscii
         0, // encodeHTMLChars
+        0, // indent
     }};
     JSONObjectEncoder *encoder = (JSONObjectEncoder *)&pyEncoder;
 
@@ -2434,10 +2442,10 @@ PyObject *objToJSON(PyObject *self, PyObject *args, PyObject *kwargs) {
 
     PRINTMARK();
 
-    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OiOssOO", kwlist, &oinput,
-                                     &oensureAscii, &idoublePrecision,
+    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OiOssOOi", kwlist,
+                                     &oinput, &oensureAscii, &idoublePrecision,
                                      &oencodeHTMLChars, &sOrient, &sdateFormat,
-                                     &oisoDates, &odefHandler)) {
+                                     &oisoDates, &odefHandler, &indent)) {
         return NULL;
     }
 
@@ -2503,6 +2511,8 @@ PyObject *objToJSON(PyObject *self, PyObject *args, PyObject *kwargs) {
         pyEncoder.defaultHandler = odefHandler;
     }
 
+    encoder->indent = indent;
+
     pyEncoder.originalOutputFormat = pyEncoder.outputFormat;
     PRINTMARK();
     ret = JSON_EncodeObject(oinput, encoder, buffer, sizeof(buffer));
diff --git a/pandas/_typing.py b/pandas/_typing.py
@@ -22,7 +22,7 @@
 FilePathOrBuffer = Union[str, Path, IO[AnyStr]]
 
 FrameOrSeries = TypeVar("FrameOrSeries", bound="NDFrame")
-Scalar = Union[str, int, float]
+Scalar = Union[str, int, float, bool]
 Axis = Union[str, int]
 Ordered = Optional[bool]
 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -8,6 +8,7 @@
 import re
 from textwrap import dedent
 from typing import (
+    Any,
     Callable,
     Dict,
     FrozenSet,
@@ -60,7 +61,7 @@
 from pandas.core.dtypes.missing import isna, notna
 
 import pandas as pd
-from pandas._typing import Dtype, FilePathOrBuffer
+from pandas._typing import Dtype, FilePathOrBuffer, Scalar
 from pandas.core import missing, nanops
 import pandas.core.algorithms as algos
 from pandas.core.base import PandasObject, SelectionMixin
@@ -2245,17 +2246,18 @@ def to_excel(
 
     def to_json(
         self,
-        path_or_buf=None,
-        orient=None,
-        date_format=None,
-        double_precision=10,
-        force_ascii=True,
-        date_unit="ms",
-        default_handler=None,
-        lines=False,
-        compression="infer",
-        index=True,
-    ):
+        path_or_buf: Optional[FilePathOrBuffer] = None,
+        orient: Optional[str] = None,
+        date_format: Optional[str] = None,
+        double_precision: int = 10,
+        force_ascii: bool_t = True,
+        date_unit: str = "ms",
+        default_handler: Optional[Callable[[Any], Union[Scalar, List, Dict]]] = None,
+        lines: bool_t = False,
+        compression: Optional[str] = "infer",
+        index: bool_t = True,
+        indent: Optional[int] = None,
+    ) -> Optional[str]:
         """
         Convert the object to a JSON string.
 
@@ -2335,6 +2337,11 @@ def to_json(
 
             .. versionadded:: 0.23.0
 
+        indent : int, optional
+            Length of whitespace used to indent each record.
+
+            .. versionadded:: 1.0.0
+
         Returns
         -------
         None or str
@@ -2345,6 +2352,13 @@ def to_json(
         --------
         read_json
 
+        Notes
+        -----
+        The behavior of ``indent=0`` differs from the stdlib ``json`` module,
+        which does not indent the output but does insert newlines. Currently,
+        ``indent=0`` and the default ``indent=None`` are equivalent in pandas,
+        though this may change in a future release.
+
         Examples
         --------
 
@@ -2395,6 +2409,10 @@ def to_json(
             date_format = "iso"
         elif date_format is None:
             date_format = "epoch"
+
+        config.is_nonnegative_int(indent)
+        indent = indent or 0
+
         return json.to_json(
             path_or_buf=path_or_buf,
             obj=self,
@@ -2407,6 +2425,7 @@ def to_json(
             lines=lines,
             compression=compression,
             index=index,
+            indent=indent,
         )
 
     def to_hdf(self, path_or_buf, key, **kwargs):
diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py

-Original file line number
+Diff line change
@@ @@ -1,6 +1,8 @@ @@
 +from collections import OrderedDict
 from io import StringIO
 from itertools import islice
 import os
 +from typing import Any, Callable, Dict, List, Optional, Type, Union
 import numpy as np
 from pandas.core.dtypes.common import ensure_str, is_period_dtype
 from pandas import DataFrame, MultiIndex, Series, isna, to_datetime
 +from pandas._typing import Scalar
 from pandas.core.reshape.concat import concat
 from pandas.io.common import (
 TABLE_SCHEMA_VERSION = "0.20.0"
 +Serializable = Union[Scalar, List, Dict]
++
 # interface to/from
 def to_json(
     path_or_buf,
     obj,
 -    orient=None,
 -    date_format="epoch",
 -    double_precision=10,
 -    force_ascii=True,
 -    date_unit="ms",
 -    default_handler=None,
 -    lines=False,
 -    compression="infer",
 -    index=True,
 +    orient: Optional[str] = None,
 +    date_format: str = "epoch",
 +    double_precision: int = 10,
 +    force_ascii: bool = True,
 +    date_unit: str = "ms",
 +    default_handler: Optional[Callable[[Any], Serializable]] = None,
 +    lines: bool = False,
 +    compression: Optional[str] = "infer",
 +    index: bool = True,
 +    indent: int = 0,
 ):
     if not index and orient not in ["split", "table"]:
     if orient == "table" and isinstance(obj, Series):
         obj = obj.to_frame(name=obj.name or "values")
     if orient == "table" and isinstance(obj, DataFrame):
 -        writer = JSONTableWriter
 +        writer = JSONTableWriter  # type: Type["Writer"]
     elif isinstance(obj, Series):
         writer = SeriesWriter
     elif isinstance(obj, DataFrame):
         date_unit=date_unit,
         default_handler=default_handler,
         index=index,
 +        indent=indent,
     ).write()
     if lines:
     def __init__(
         self,
         obj,
 -        orient,
 -        date_format,
 -        double_precision,
 -        ensure_ascii,
 -        date_unit,
 -        index,
 -        default_handler=None,
 +        orient: Optional[str],
 +        date_format: str,
 +        double_precision: int,
 +        ensure_ascii: bool,
 +        date_unit: str,
 +        index: bool,
 +        default_handler: Optional[Callable[[Any], Serializable]] = None,
 +        indent: int = 0,
     ):
         self.obj = obj
         if orient is None:
 -            orient = self._default_orient
 +            orient = self._default_orient  # type: ignore
         self.orient = orient
         self.date_format = date_format
         self.date_unit = date_unit
         self.default_handler = default_handler
         self.index = index
 +        self.indent = indent
         self.is_copy = None
         self._format_axes()
             self.date_unit,
             self.date_format == "iso",
             self.default_handler,
 +            self.indent,
+        )
     def _write(
         self,
         obj,
 -        orient,
 -        double_precision,
 -        ensure_ascii,
 -        date_unit,
 -        iso_dates,
 -        default_handler,
 +        orient: Optional[str],
 +        double_precision: int,
 +        ensure_ascii: bool,
 +        date_unit: str,
 +        iso_dates: bool,
 +        default_handler: Optional[Callable[[Any], Serializable]],
 +        indent: int,
     ):
         return dumps(
             obj,
             date_unit=date_unit,
             iso_dates=iso_dates,
             default_handler=default_handler,
 +            indent=indent,
+        )
     def _write(
         self,
         obj,
 -        orient,
 -        double_precision,
 -        ensure_ascii,
 -        date_unit,
 -        iso_dates,
 -        default_handler,
 +        orient: Optional[str],
 +        double_precision: int,
 +        ensure_ascii: bool,
 +        date_unit: str,
 +        iso_dates: bool,
 +        default_handler: Optional[Callable[[Any], Serializable]],
 +        indent: int,
     ):
         if not self.index and orient == "split":
             obj = {"name": obj.name, "data": obj.values}
             date_unit,
             iso_dates,
             default_handler,
 +            indent,
+        )
     def _write(
         self,
         obj,
 -        orient,
 -        double_precision,
 -        ensure_ascii,
 -        date_unit,
 -        iso_dates,
 -        default_handler,
 +        orient: Optional[str],
 +        double_precision: int,
 +        ensure_ascii: bool,
 +        date_unit: str,
 +        iso_dates: bool,
 +        default_handler: Optional[Callable[[Any], Serializable]],
 +        indent: int,
     ):
         if not self.index and orient == "split":
             obj = obj.to_dict(orient="split")
             date_unit,
             iso_dates,
             default_handler,
 +            indent,
+        )
     def __init__(
         self,
         obj,
 -        orient,
 -        date_format,
 -        double_precision,
 -        ensure_ascii,
 -        date_unit,
 -        index,
 -        default_handler=None,
 +        orient: Optional[str],
 +        date_format: str,
 +        double_precision: int,
 +        ensure_ascii: bool,
 +        date_unit: str,
 +        index: bool,
 +        default_handler: Optional[Callable[[Any], Serializable]] = None,
 +        indent: int = 0,
     ):
         """
         Adds a `schema` attribute with the Table Schema, resets
         the index (can't do in caller, because the schema inference needs
         to know what the index is), forces orient to records, and forces
         date_format to 'iso'.
         """
++
         super().__init__(
             obj,
             orient,
             date_unit,
             index,
             default_handler=default_handler,
 +            indent=indent,
+        )
         if date_format != "iso":
         date_unit,
         iso_dates,
         default_handler,
 +        indent,
     ):
 -        data = super()._write(
 -            obj,
 +        table_obj = OrderedDict((("schema", self.schema), ("data", obj)))
 +        serialized = super()._write(
 +            table_obj,
             orient,
             double_precision,
             ensure_ascii,
             date_unit,
             iso_dates,
             default_handler,
 +            indent,
+        )
 -        serialized = '{{"schema": {schema}, "data": {data}}}'.format(
 -            schema=dumps(self.schema), data=data
 -        )
++
         return serialized
-Original file line number
+Diff line change
 import numpy as np
 import pytest
 +from pandas.compat import PY35
++
 from pandas.core.dtypes.dtypes import CategoricalDtype, DatetimeTZDtype, PeriodDtype
 import pandas as pd
+)
 +def assert_results_equal(result, expected):
 +    """Helper function for comparing deserialized JSON with Py35 compat."""
 +    if PY35:
 +        assert sorted(result.items()) == sorted(expected.items())
 +    else:
 +        assert result == expected
++
++
 class TestBuildSchema:
     def setup_method(self, method):
         self.df = DataFrame(
                 ),
+            ]
+        )
 -        assert result == expected
++
 +        assert_results_equal(result, expected)
     def test_to_json(self):
         df = self.df.copy()
             ),
+        ]
         expected = OrderedDict([("schema", schema), ("data", data)])
 -        assert result == expected
++
 +        assert_results_equal(result, expected)
     def test_to_json_float_index(self):
         data = pd.Series(1, index=[1.0, 2.0])
                 ),
+            ]
+        )
 -        assert result == expected
++
 +        assert_results_equal(result, expected)
     def test_to_json_period_index(self):
         idx = pd.period_range("2016", freq="Q-JAN", periods=2)
             OrderedDict([("index", "2016-02-01T00:00:00.000Z"), ("values", 1)]),
+        ]
         expected = OrderedDict([("schema", schema), ("data", data)])
 -        assert result == expected
++
 +        assert_results_equal(result, expected)
     def test_to_json_categorical_index(self):
         data = pd.Series(1, pd.CategoricalIndex(["a", "b"]))
                 ),
+            ]
+        )
 -        assert result == expected
++
 +        assert_results_equal(result, expected)
     def test_date_format_raises(self):
         with pytest.raises(ValueError):
                 ),
+            ]
+        )
 -        assert result == expected
++
 +        assert_results_equal(result, expected)
     @pytest.mark.parametrize(
         "idx,nm,prop",
+        )
         result = df.to_json(orient="table")
         js = json.loads(result)
 -        assert js["schema"]["fields"][1]["name"] == 1451606400000
 +        assert js["schema"]["fields"][1]["name"] == "2016-01-01T00:00:00.000Z"
 +        # TODO - below expectation is not correct; see GH 28256
         assert js["schema"]["fields"][2]["name"] == 10000
     @pytest.mark.parametrize(
-Original file line number
+Diff line change
         df = pd.DataFrame([[1]], index=[("a", "b")], columns=[("c", "d")])
         result = df.to_json(orient=orient)
         assert result == expected
++
 +    @pytest.mark.parametrize("indent", [1, 2, 4])
 +    def test_to_json_indent(self, indent):
 +        # GH 12004
 +        df = pd.DataFrame([["foo", "bar"], ["baz", "qux"]], columns=["a", "b"])
++
 +        result = df.to_json(indent=indent)
 +        spaces = " " * indent
 +        expected = """{{
 +{spaces}"a":{{
 +{spaces}{spaces}"0":"foo",
 +{spaces}{spaces}"1":"baz"
 +{spaces}}},
 +{spaces}"b":{{
 +{spaces}{spaces}"0":"bar",
 +{spaces}{spaces}"1":"qux"
 +{spaces}}}
 +}}""".format(
 +            spaces=spaces
 +        )
++
 +        assert result == expected
++
 +    @pytest.mark.parametrize(
 +        "orient,expected",
 +        [
 +            (
 +                "split",
 +                """{
 +    "columns":[
 +        "a",
 +        "b"
 +    ],
 +    "index":[
 +        0,
 +        1
 +    ],
 +    "data":[
 +        [
 +            "foo",
 +            "bar"
 +        ],
 +        [
 +            "baz",
 +            "qux"
 +        ]
 +    ]
 +}""",
 +            ),
 +            (
 +                "records",
 +                """[
 +    {
 +        "a":"foo",
 +        "b":"bar"
 +    },
 +    {
 +        "a":"baz",
 +        "b":"qux"
 +    }
 +]""",
 +            ),
 +            (
 +                "index",
 +                """{
 +    "0":{
 +        "a":"foo",
 +        "b":"bar"
 +    },
 +    "1":{
 +        "a":"baz",
 +        "b":"qux"
 +    }
 +}""",
 +            ),
 +            (
 +                "columns",
 +                """{
 +    "a":{
 +        "0":"foo",
 +        "1":"baz"
 +    },
 +    "b":{
 +        "0":"bar",
 +        "1":"qux"
 +    }
 +}""",
 +            ),
 +            (
 +                "values",
 +                """[
 +    [
 +        "foo",
 +        "bar"
 +    ],
 +    [
 +        "baz",
 +        "qux"
 +    ]
 +]""",
 +            ),
 +            (
 +                "table",
 +                """{
 +    "schema":{
 +        "fields":[
 +            {
 +                "name":"index",
 +                "type":"integer"
 +            },
 +            {
 +                "name":"a",
 +                "type":"string"
 +            },
 +            {
 +                "name":"b",
 +                "type":"string"
 +            }
 +        ],
 +        "primaryKey":[
 +            "index"
 +        ],
 +        "pandas_version":"0.20.0"
 +    },
 +    "data":[
 +        {
 +            "index":0,
 +            "a":"foo",
 +            "b":"bar"
 +        },
 +        {
 +            "index":1,
 +            "a":"baz",
 +            "b":"qux"
 +        }
 +    ]
 +}""",
 +            ),
 +        ],
 +    )
 +    def test_json_indent_all_orients(self, orient, expected):
 +        # GH 12004
 +        df = pd.DataFrame([["foo", "bar"], ["baz", "qux"]], columns=["a", "b"])
 +        result = df.to_json(orient=orient, indent=4)
++
 +        if PY35:
 +            assert json.loads(result) == json.loads(expected)
 +        else:
 +            assert result == expected
++
 +    def test_json_negative_indent_raises(self):
 +        with pytest.raises(ValueError, match="must be a nonnegative integer"):
 +            pd.DataFrame().to_json(indent=-1)