Skip to content

Add support for date_unit="D" in read_json and to_json #51000

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 10 additions & 3 deletions doc/source/user_guide/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1859,7 +1859,7 @@ with optional parameters:
* ``date_format`` : string, type of date conversion, 'epoch' for timestamp, 'iso' for ISO8601.
* ``double_precision`` : The number of decimal places to use when encoding floating point values, default 10.
* ``force_ascii`` : force encoded string to be ASCII, default True.
* ``date_unit`` : The time unit to encode to, governs timestamp and ISO8601 precision. One of 's', 'ms', 'us' or 'ns' for seconds, milliseconds, microseconds and nanoseconds respectively. Default 'ms'.
* ``date_unit`` : The time unit to encode to, governs timestamp and ISO8601 precision. One of 'D', 's', 'ms', 'us' or 'ns' for days, seconds, milliseconds, microseconds and nanoseconds respectively. Default 'ms'.
* ``default_handler`` : The handler to call if an object cannot otherwise be converted to a suitable format for JSON. Takes a single argument, which is the object to convert, and returns a serializable object.
* ``lines`` : If ``records`` orient, then will write each record per line as json.
* ``mode`` : string, writer mode when writing to path. 'w' for write, 'a' for append. Default 'w'
Expand Down Expand Up @@ -1959,6 +1959,13 @@ Writing in ISO date format, with microseconds:
json = dfd.to_json(date_format="iso", date_unit="us")
json

Writing in ISO date format, with just dates:

.. ipython:: python

json = dfd.to_json(date_format="iso", date_unit="D")
json

Epoch timestamps, in seconds:

.. ipython:: python
Expand Down Expand Up @@ -2059,8 +2066,8 @@ is ``None``. To explicitly force ``Series`` parsing, pass ``typ=series``
* ``precise_float`` : boolean, default ``False``. Set to enable usage of higher precision (strtod) function when decoding string to double values. Default (``False``) is to use fast but less precise builtin functionality.
* ``date_unit`` : string, the timestamp unit to detect if converting dates. Default
None. By default the timestamp precision will be detected, if this is not desired
then pass one of 's', 'ms', 'us' or 'ns' to force timestamp precision to
seconds, milliseconds, microseconds or nanoseconds respectively.
then pass one of 'D', 's', 'ms', 'us' or 'ns' to force timestamp precision to
days, seconds, milliseconds, microseconds or nanoseconds respectively.
* ``lines`` : reads file as one json object per line.
* ``encoding`` : The encoding to use to decode py3 bytes.
* ``chunksize`` : when used in combination with ``lines=True``, return a ``pandas.api.typing.JsonReader`` which reads in ``chunksize`` lines per iteration.
Expand Down
4 changes: 4 additions & 0 deletions pandas/_libs/src/datetime/date_conversions.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,10 @@ int scaleNanosecToUnit(npy_int64 *value, NPY_DATETIMEUNIT unit) {
case NPY_FR_s:
*value /= 1000000000LL;
break;
case NPY_FR_D:
// ns per day = 24 * 60 * 60 * 1000000000LL; divide to convert ns to days
*value /= 86400000000000LL;
break;
default:
return -1;
}
Expand Down
2 changes: 2 additions & 0 deletions pandas/_libs/src/vendored/ujson/python/objToJSON.c
Original file line number Diff line number Diff line change
Expand Up @@ -2089,6 +2089,8 @@ PyObject *objToJSON(PyObject *Py_UNUSED(self), PyObject *args,
pyEncoder.datetimeUnit = NPY_FR_us;
} else if (strcmp(sdateFormat, "ns") == 0) {
pyEncoder.datetimeUnit = NPY_FR_ns;
} else if (strcmp(sdateFormat, "D") == 0) {
pyEncoder.datetimeUnit = NPY_FR_D;
} else {
PyErr_Format(PyExc_ValueError,
"Invalid value '%s' for option 'date_unit'",
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -2426,8 +2426,8 @@ def to_json(
Force encoded string to be ASCII.
date_unit : str, default 'ms' (milliseconds)
The time unit to encode to, governs timestamp and ISO8601
precision. One of 's', 'ms', 'us', 'ns' for second, millisecond,
microsecond, and nanosecond respectively.
precision. One of 'D', 's', 'ms', 'us', 'ns' for day, second,
millisecond, microsecond, and nanosecond respectively.
default_handler : callable, default None
Handler to call if object cannot otherwise be converted to a
suitable format for JSON. Should receive a single argument which is
Expand Down
13 changes: 9 additions & 4 deletions pandas/io/json/_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -622,8 +622,10 @@ def read_json(
date_unit : str, default None
The timestamp unit to detect if converting dates. The default behaviour
is to try and detect the correct precision, but if this is not desired
then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds,
milliseconds, microseconds or nanoseconds respectively.
then pass one of 'D', 's', 'ms', 'us' or 'ns' to force parsing only days,
seconds, milliseconds, microseconds or nanoseconds respectively.
Time units (from 's' to 'ns') are automatically inferred from the input,
while the 'D' unit is never inferred and must be specified explicitly.

encoding : str, default is 'utf-8'
The encoding to use to decode py3 bytes.
Expand Down Expand Up @@ -1115,12 +1117,13 @@ class Parser:
_split_keys: tuple[str, ...]
_default_orient: str

_STAMP_UNITS = ("s", "ms", "us", "ns")
_STAMP_UNITS = ("s", "ms", "us", "ns", "D")
_MIN_STAMPS = {
"s": 31536000,
"ms": 31536000000,
"us": 31536000000000,
"ns": 31536000000000000,
"D": 365,
}

def __init__(
Expand All @@ -1145,7 +1148,9 @@ def __init__(
self.dtype = dtype

if date_unit is not None:
date_unit = date_unit.lower()
# avoid lowercasing "D" but keep backward compatibility for the other units
if date_unit != "D":
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not just put a lowercase D in _STAMP_UNITS?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wanted to keep the exact same format passed to the underlying function

date_unit = date_unit.lower()
if date_unit not in self._STAMP_UNITS:
raise ValueError(f"date_unit must be one of {self._STAMP_UNITS}")
self.min_stamp = self._MIN_STAMPS[date_unit]
Expand Down
35 changes: 34 additions & 1 deletion pandas/tests/io/json/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -883,6 +883,7 @@ def test_convert_dates_infer(self, infer_word):
("20130101 20:43:42.123", "ms"),
("20130101 20:43:42.123456", "us"),
("20130101 20:43:42.123456789", "ns"),
("20130101", "D"),
],
)
def test_date_format_frame(self, date, date_unit, datetime_frame):
Expand Down Expand Up @@ -914,6 +915,7 @@ def test_date_format_frame_raises(self, datetime_frame):
("20130101 20:43:42.123", "ms"),
("20130101 20:43:42.123456", "us"),
("20130101 20:43:42.123456789", "ns"),
("20130101", "D"),
],
)
def test_date_format_series(self, date, date_unit, datetime_series):
Expand All @@ -936,7 +938,7 @@ def test_date_format_series_raises(self, datetime_series):
ts.to_json(date_format="iso", date_unit="foo")

@pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"])
def test_date_unit(self, unit, datetime_frame):
def test_date_unit_time(self, unit, datetime_frame):
df = datetime_frame
df["date"] = Timestamp("20130101 20:43:42").as_unit("ns")
dl = df.columns.get_loc("date")
Expand All @@ -954,6 +956,37 @@ def test_date_unit(self, unit, datetime_frame):
result = read_json(StringIO(json), date_unit=None)
tm.assert_frame_equal(result, df)

def test_date_unit_day(self, datetime_frame: DataFrame):
    """Round-trip epoch serialization with ``date_unit="D"``.

    Unlike the sub-second units (see test_date_unit_time), "D" is not
    auto-detected by read_json, so it must be passed explicitly when
    reading the JSON back.
    """
    df = datetime_frame
    df["date"] = Timestamp("20130102 20:43:42").as_unit("ns")
    dl = df.columns.get_loc("date")

    # cover pre-epoch, far-future, and missing values
    df.iloc[1, dl] = Timestamp("19710102 20:43:42")
    df.iloc[2, dl] = Timestamp("21460101 20:43:42")
    df.iloc[4, dl] = pd.NaT

    json_str = df.to_json(date_format="epoch", date_unit="D")

    # the time-of-day part is dropped when serializing with unit "D",
    # so normalize the expected frame to midnight before comparing
    df["date"] = pd.to_datetime(df["date"].dt.date)

    parsed = json.loads(json_str)
    json_date = Series(parsed["date"].values(), index=df.index)
    # expected epoch-day values: seconds since the epoch / 86400
    seconds_per_day = 60 * 60 * 24
    df_date = df["date"].map(
        lambda d: None if pd.isna(d) else d.timestamp() / seconds_per_day
    )
    tm.assert_series_equal(json_date, df_date, check_names=False)

    result = read_json(StringIO(json_str), date_unit="D")
    tm.assert_frame_equal(result, df)

def test_weird_nested_json(self):
# this used to core dump the parser
s = r"""{
Expand Down
3 changes: 3 additions & 0 deletions pandas/tests/io/json/test_ujson.py
Original file line number Diff line number Diff line change
Expand Up @@ -394,6 +394,9 @@ def test_datetime_units(self):
val = datetime.datetime(2013, 8, 17, 21, 17, 12, 215504)
stamp = Timestamp(val).as_unit("ns")

roundtrip = ujson.decode(ujson.encode(val, date_unit="D"))
assert roundtrip == stamp.value // (864 * 10**11)

roundtrip = ujson.decode(ujson.encode(val, date_unit="s"))
assert roundtrip == stamp._value // 10**9

Expand Down