Skip to content

Add support for date_unit="D" in read_json and to_json #51000

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 10 additions & 3 deletions doc/source/user_guide/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1859,7 +1859,7 @@ with optional parameters:
* ``date_format`` : string, type of date conversion, 'epoch' for timestamp, 'iso' for ISO8601.
* ``double_precision`` : The number of decimal places to use when encoding floating point values, default 10.
* ``force_ascii`` : force encoded string to be ASCII, default True.
* ``date_unit`` : The time unit to encode to, governs timestamp and ISO8601 precision. One of 's', 'ms', 'us' or 'ns' for seconds, milliseconds, microseconds and nanoseconds respectively. Default 'ms'.
* ``date_unit`` : The time unit to encode to, governs timestamp and ISO8601 precision. One of 'D', 's', 'ms', 'us' or 'ns' for days, seconds, milliseconds, microseconds and nanoseconds respectively. Default 'ms'.
* ``default_handler`` : The handler to call if an object cannot otherwise be converted to a suitable format for JSON. Takes a single argument, which is the object to convert, and returns a serializable object.
* ``lines`` : If ``records`` orient, then will write each record per line as json.
* ``mode`` : string, writer mode when writing to path. 'w' for write, 'a' for append. Default 'w'
Expand Down Expand Up @@ -1959,6 +1959,13 @@ Writing in ISO date format, with microseconds:
json = dfd.to_json(date_format="iso", date_unit="us")
json

Writing in ISO date format, with just dates:

.. ipython:: python

json = dfd.to_json(date_format="iso", date_unit="D")
json

Epoch timestamps, in seconds:

.. ipython:: python
Expand Down Expand Up @@ -2059,8 +2066,8 @@ is ``None``. To explicitly force ``Series`` parsing, pass ``typ=series``
* ``precise_float`` : boolean, default ``False``. Set to enable usage of higher precision (strtod) function when decoding string to double values. Default (``False``) is to use fast but less precise builtin functionality.
* ``date_unit`` : string, the timestamp unit to detect if converting dates. Default
None. By default the timestamp precision will be detected, if this is not desired
then pass one of 's', 'ms', 'us' or 'ns' to force timestamp precision to
seconds, milliseconds, microseconds or nanoseconds respectively.
then pass one of 'D', 's', 'ms', 'us' or 'ns' to force timestamp precision to
days, seconds, milliseconds, microseconds or nanoseconds respectively.
* ``lines`` : reads file as one json object per line.
* ``encoding`` : The encoding to use to decode py3 bytes.
* ``chunksize`` : when used in combination with ``lines=True``, return a ``pandas.api.typing.JsonReader`` which reads in ``chunksize`` lines per iteration.
Expand Down
4 changes: 4 additions & 0 deletions pandas/_libs/src/datetime/date_conversions.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,10 @@ int scaleNanosecToUnit(npy_int64 *value, NPY_DATETIMEUNIT unit) {
case NPY_FR_s:
*value /= 1000000000LL;
break;
case NPY_FR_D:
// ns per day = 24 * 60 * 60 * 1000000000LL; divide to convert ns to days
*value /= 86400000000000LL;
break;
default:
return -1;
}
Expand Down
2 changes: 2 additions & 0 deletions pandas/_libs/src/vendored/ujson/python/objToJSON.c
Original file line number Diff line number Diff line change
Expand Up @@ -2089,6 +2089,8 @@ PyObject *objToJSON(PyObject *Py_UNUSED(self), PyObject *args,
pyEncoder.datetimeUnit = NPY_FR_us;
} else if (strcmp(sdateFormat, "ns") == 0) {
pyEncoder.datetimeUnit = NPY_FR_ns;
} else if (strcmp(sdateFormat, "D") == 0) {
pyEncoder.datetimeUnit = NPY_FR_D;
} else {
PyErr_Format(PyExc_ValueError,
"Invalid value '%s' for option 'date_unit'",
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -2426,8 +2426,8 @@ def to_json(
Force encoded string to be ASCII.
date_unit : str, default 'ms' (milliseconds)
The time unit to encode to, governs timestamp and ISO8601
precision. One of 's', 'ms', 'us', 'ns' for second, millisecond,
microsecond, and nanosecond respectively.
precision. One of 'D', 's', 'ms', 'us', 'ns' for day, second,
millisecond, microsecond, and nanosecond respectively.
default_handler : callable, default None
Handler to call if object cannot otherwise be converted to a
suitable format for JSON. Should receive a single argument which is
Expand Down
13 changes: 9 additions & 4 deletions pandas/io/json/_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -622,8 +622,10 @@ def read_json(
date_unit : str, default None
The timestamp unit to detect if converting dates. The default behaviour
is to try and detect the correct precision, but if this is not desired
then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds,
milliseconds, microseconds or nanoseconds respectively.
then pass one of 'D', 's', 'ms', 'us' or 'ns' to force parsing only days,
seconds, milliseconds, microseconds or nanoseconds respectively.
Time units (from 's' to 'ns') are automatically inferred from the input,
while the 'D' unit is never inferred and must be specified explicitly.

encoding : str, default is 'utf-8'
The encoding to use to decode py3 bytes.
Expand Down Expand Up @@ -1115,12 +1117,13 @@ class Parser:
_split_keys: tuple[str, ...]
_default_orient: str

_STAMP_UNITS = ("s", "ms", "us", "ns")
_STAMP_UNITS = ("s", "ms", "us", "ns", "D")
_MIN_STAMPS = {
"s": 31536000,
"ms": 31536000000,
"us": 31536000000000,
"ns": 31536000000000000,
"D": 365,
}

def __init__(
Expand All @@ -1145,7 +1148,9 @@ def __init__(
self.dtype = dtype

if date_unit is not None:
date_unit = date_unit.lower()
# avoid lowercasing "D" but keep backward compatibility for the other units
if date_unit != "D":
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not just put a lowercase D in _STAMP_UNITS?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wanted to keep the exact same format passed to the underlying function

date_unit = date_unit.lower()
if date_unit not in self._STAMP_UNITS:
raise ValueError(f"date_unit must be one of {self._STAMP_UNITS}")
self.min_stamp = self._MIN_STAMPS[date_unit]
Expand Down
35 changes: 34 additions & 1 deletion pandas/tests/io/json/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -883,6 +883,7 @@ def test_convert_dates_infer(self, infer_word):
("20130101 20:43:42.123", "ms"),
("20130101 20:43:42.123456", "us"),
("20130101 20:43:42.123456789", "ns"),
("20130101", "D"),
],
)
def test_date_format_frame(self, date, date_unit, datetime_frame):
Expand Down Expand Up @@ -914,6 +915,7 @@ def test_date_format_frame_raises(self, datetime_frame):
("20130101 20:43:42.123", "ms"),
("20130101 20:43:42.123456", "us"),
("20130101 20:43:42.123456789", "ns"),
("20130101", "D"),
],
)
def test_date_format_series(self, date, date_unit, datetime_series):
Expand All @@ -936,7 +938,7 @@ def test_date_format_series_raises(self, datetime_series):
ts.to_json(date_format="iso", date_unit="foo")

@pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"])
def test_date_unit(self, unit, datetime_frame):
def test_date_unit_time(self, unit, datetime_frame):
df = datetime_frame
df["date"] = Timestamp("20130101 20:43:42").as_unit("ns")
dl = df.columns.get_loc("date")
Expand All @@ -954,6 +956,37 @@ def test_date_unit(self, unit, datetime_frame):
result = read_json(StringIO(json), date_unit=None)
tm.assert_frame_equal(result, df)

def test_date_unit_day(self, datetime_frame: DataFrame):
    """Round-trip epoch serialization with ``date_unit="D"``.

    Unlike the sub-second units (see test_date_unit_time), "D" is not
    auto-detected by read_json, so it must be passed explicitly when
    reading the JSON back.
    """
    df = datetime_frame
    df["date"] = Timestamp("20130102 20:43:42").as_unit("ns")
    dl = df.columns.get_loc("date")

    # cover pre-epoch, far-future, and missing values
    df.iloc[1, dl] = Timestamp("19710102 20:43:42")
    df.iloc[2, dl] = Timestamp("21460101 20:43:42")
    df.iloc[4, dl] = pd.NaT

    json_str = df.to_json(date_format="epoch", date_unit="D")

    # the time-of-day part is dropped when serializing with unit "D",
    # so normalize the expected frame to midnight before comparing
    df["date"] = pd.to_datetime(df["date"].dt.date)

    parsed = json.loads(json_str)
    json_date = Series(parsed["date"].values(), index=df.index)
    # expected epoch-day values: seconds since the epoch / 86400
    seconds_per_day = 60 * 60 * 24
    df_date = df["date"].map(
        lambda d: None if pd.isna(d) else d.timestamp() / seconds_per_day
    )
    tm.assert_series_equal(json_date, df_date, check_names=False)

    result = read_json(StringIO(json_str), date_unit="D")
    tm.assert_frame_equal(result, df)

def test_weird_nested_json(self):
# this used to core dump the parser
s = r"""{
Expand Down
3 changes: 3 additions & 0 deletions pandas/tests/io/json/test_ujson.py
Original file line number Diff line number Diff line change
Expand Up @@ -394,6 +394,9 @@ def test_datetime_units(self):
val = datetime.datetime(2013, 8, 17, 21, 17, 12, 215504)
stamp = Timestamp(val).as_unit("ns")

roundtrip = ujson.decode(ujson.encode(val, date_unit="D"))
assert roundtrip == stamp.value // (864 * 10**11)

roundtrip = ujson.decode(ujson.encode(val, date_unit="s"))
assert roundtrip == stamp._value // 10**9

Expand Down