diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index daf4c7b54331b..349aa4a09a3ad 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1859,7 +1859,7 @@ with optional parameters: * ``date_format`` : string, type of date conversion, 'epoch' for timestamp, 'iso' for ISO8601. * ``double_precision`` : The number of decimal places to use when encoding floating point values, default 10. * ``force_ascii`` : force encoded string to be ASCII, default True. -* ``date_unit`` : The time unit to encode to, governs timestamp and ISO8601 precision. One of 's', 'ms', 'us' or 'ns' for seconds, milliseconds, microseconds and nanoseconds respectively. Default 'ms'. +* ``date_unit`` : The time unit to encode to, governs timestamp and ISO8601 precision. One of 'D', 's', 'ms', 'us' or 'ns' for days, seconds, milliseconds, microseconds and nanoseconds respectively. Default 'ms'. * ``default_handler`` : The handler to call if an object cannot otherwise be converted to a suitable format for JSON. Takes a single argument, which is the object to convert, and returns a serializable object. * ``lines`` : If ``records`` orient, then will write each record per line as json. * ``mode`` : string, writer mode when writing to path. 'w' for write, 'a' for append. Default 'w' @@ -1959,6 +1959,13 @@ Writing in ISO date format, with microseconds: json = dfd.to_json(date_format="iso", date_unit="us") json +Writing in ISO date format, with just dates: + +.. ipython:: python + + json = dfd.to_json(date_format="iso", date_unit="D") + json + Epoch timestamps, in seconds: .. ipython:: python @@ -2059,8 +2066,8 @@ is ``None``. To explicitly force ``Series`` parsing, pass ``typ=series`` * ``precise_float`` : boolean, default ``False``. Set to enable usage of higher precision (strtod) function when decoding string to double values. Default (``False``) is to use fast but less precise builtin functionality. 
* ``date_unit`` : string, the timestamp unit to detect if converting dates. Default None. By default the timestamp precision will be detected, if this is not desired - then pass one of 's', 'ms', 'us' or 'ns' to force timestamp precision to - seconds, milliseconds, microseconds or nanoseconds respectively. + then pass one of 'D', 's', 'ms', 'us' or 'ns' to force timestamp precision to + days, seconds, milliseconds, microseconds or nanoseconds respectively. * ``lines`` : reads file as one json object per line. * ``encoding`` : The encoding to use to decode py3 bytes. * ``chunksize`` : when used in combination with ``lines=True``, return a ``pandas.api.typing.JsonReader`` which reads in ``chunksize`` lines per iteration. diff --git a/pandas/_libs/src/datetime/date_conversions.c b/pandas/_libs/src/datetime/date_conversions.c index 84fc5507010ed..7184fe43d01d3 100644 --- a/pandas/_libs/src/datetime/date_conversions.c +++ b/pandas/_libs/src/datetime/date_conversions.c @@ -33,6 +33,10 @@ int scaleNanosecToUnit(npy_int64 *value, NPY_DATETIMEUNIT unit) { case NPY_FR_s: *value /= 1000000000LL; break; + case NPY_FR_D: + // 24 * 60 * 60 * 1000000000LL to convert to days + *value /= 86400000000000LL; + break; default: return -1; } diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c index 65b468f268d75..9105b7b1a2fae 100644 --- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c +++ b/pandas/_libs/src/vendored/ujson/python/objToJSON.c @@ -2089,6 +2089,8 @@ PyObject *objToJSON(PyObject *Py_UNUSED(self), PyObject *args, pyEncoder.datetimeUnit = NPY_FR_us; } else if (strcmp(sdateFormat, "ns") == 0) { pyEncoder.datetimeUnit = NPY_FR_ns; + } else if (strcmp(sdateFormat, "D") == 0) { + pyEncoder.datetimeUnit = NPY_FR_D; } else { PyErr_Format(PyExc_ValueError, "Invalid value '%s' for option 'date_unit'", diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 621b31c913c68..38c125556ff03 100644 --- 
a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2426,8 +2426,8 @@ def to_json( Force encoded string to be ASCII. date_unit : str, default 'ms' (milliseconds) The time unit to encode to, governs timestamp and ISO8601 - precision. One of 's', 'ms', 'us', 'ns' for second, millisecond, - microsecond, and nanosecond respectively. + precision. One of 'D', 's', 'ms', 'us', 'ns' for days, + second, millisecond, microsecond, and nanosecond respectively. default_handler : callable, default None Handler to call if object cannot otherwise be converted to a suitable format for JSON. Should receive a single argument which is diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index ec0469a393873..ff11b05b8fa1e 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -622,8 +622,10 @@ def read_json( date_unit : str, default None The timestamp unit to detect if converting dates. The default behaviour is to try and detect the correct precision, but if this is not desired - then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds, - milliseconds, microseconds or nanoseconds respectively. + then pass one of 'D', 's', 'ms', 'us' or 'ns' to force parsing only days, + seconds, milliseconds, microseconds or nanoseconds respectively. + Time units (from 's' to 'ns') are automatically inferred from the input, + while the 'D' unit is not inferred and must be specified explicitly. encoding : str, default is 'utf-8' The encoding to use to decode py3 bytes. @@ -1115,12 +1117,13 @@ class Parser: _split_keys: tuple[str, ...] 
_default_orient: str - _STAMP_UNITS = ("s", "ms", "us", "ns") + _STAMP_UNITS = ("s", "ms", "us", "ns", "D") _MIN_STAMPS = { "s": 31536000, "ms": 31536000000, "us": 31536000000000, "ns": 31536000000000000, + "D": 365, } def __init__( @@ -1145,7 +1148,9 @@ def __init__( self.dtype = dtype if date_unit is not None: - date_unit = date_unit.lower() + # avoid lowercasing "D" but ensure backward compatibility for other units + if date_unit != "D": + date_unit = date_unit.lower() if date_unit not in self._STAMP_UNITS: raise ValueError(f"date_unit must be one of {self._STAMP_UNITS}") self.min_stamp = self._MIN_STAMPS[date_unit] diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 11909bf56f05c..4c85f3379d204 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -883,6 +883,7 @@ def test_convert_dates_infer(self, infer_word): ("20130101 20:43:42.123", "ms"), ("20130101 20:43:42.123456", "us"), ("20130101 20:43:42.123456789", "ns"), + ("20130101", "D"), ], ) def test_date_format_frame(self, date, date_unit, datetime_frame): @@ -914,6 +915,7 @@ def test_date_format_frame_raises(self, datetime_frame): ("20130101 20:43:42.123", "ms"), ("20130101 20:43:42.123456", "us"), ("20130101 20:43:42.123456789", "ns"), + ("20130101", "D"), ], ) def test_date_format_series(self, date, date_unit, datetime_series): @@ -936,7 +938,7 @@ def test_date_format_series_raises(self, datetime_series): ts.to_json(date_format="iso", date_unit="foo") @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) - def test_date_unit(self, unit, datetime_frame): + def test_date_unit_time(self, unit, datetime_frame): df = datetime_frame df["date"] = Timestamp("20130101 20:43:42").as_unit("ns") dl = df.columns.get_loc("date") @@ -954,6 +956,37 @@ def test_date_unit(self, unit, datetime_frame): result = read_json(StringIO(json), date_unit=None) tm.assert_frame_equal(result, df) + def test_date_unit_day(self, datetime_frame: DataFrame): + 
# a different test is implemented for unit="D" + # since it needs some handling of the df, because unit + # is not autodetected by the read_json method + df = datetime_frame + df["date"] = Timestamp("20130102 20:43:42").as_unit("ns") + dl = df.columns.get_loc("date") + + df.iloc[1, dl] = Timestamp("19710102 20:43:42") + df.iloc[2, dl] = Timestamp("21460101 20:43:42") + df.iloc[4, dl] = pd.NaT + + jsonStr = df.to_json(date_format="epoch", date_unit="D") + + # remove time part since it doesn't get serialized + # so it won't be equal in the deserialized df + df["date"] = pd.to_datetime(df["date"].dt.date) + + parsed = json.loads(jsonStr) + jsonDate = Series(parsed["date"].values(), index=df.index) + dfDate = ( + df["date"].map(lambda d: d.timestamp() if not pd.isna(d) else None) + / 60 + / 60 + / 24 + ) + tm.assert_series_equal(jsonDate, dfDate, check_names=False) + + result = read_json(StringIO(jsonStr), date_unit="D") + tm.assert_frame_equal(result, df) + def test_weird_nested_json(self): # this used to core dump the parser s = r"""{ diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 0df6b1eef72c0..8c44c0ae7e337 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -394,6 +394,9 @@ def test_datetime_units(self): val = datetime.datetime(2013, 8, 17, 21, 17, 12, 215504) stamp = Timestamp(val).as_unit("ns") + roundtrip = ujson.decode(ujson.encode(val, date_unit="D")) + assert roundtrip == stamp.value // (864 * 10**11) + roundtrip = ujson.decode(ujson.encode(val, date_unit="s")) assert roundtrip == stamp._value // 10**9