From 3e323d9cc28eda2550c3fcf79f5fd5c2800e785e Mon Sep 17 00:00:00 2001 From: Davide Canton Date: Thu, 26 Jan 2023 22:17:41 +0100 Subject: [PATCH 1/4] ENH: adding support for date_unit="D" (#16492) --- .../_libs/src/ujson/python/date_conversions.c | 3 +++ pandas/_libs/src/ujson/python/objToJSON.c | 4 ++- pandas/io/json/_json.py | 7 ++++-- pandas/tests/io/json/test_pandas.py | 25 ++++++++++++++++++- pandas/tests/io/json/test_ujson.py | 3 +++ 5 files changed, 38 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/src/ujson/python/date_conversions.c b/pandas/_libs/src/ujson/python/date_conversions.c index 86cb68f869cb0..0f648b9b25eec 100644 --- a/pandas/_libs/src/ujson/python/date_conversions.c +++ b/pandas/_libs/src/ujson/python/date_conversions.c @@ -33,6 +33,9 @@ int scaleNanosecToUnit(npy_int64 *value, NPY_DATETIMEUNIT unit) { case NPY_FR_s: *value /= 1000000000LL; break; + case NPY_FR_D: + // 24 * 60 * 60 * 1000000000LL to convert to days + *value /= 86400000000000LL; default: return -1; } diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 13b96f9f8fccd..85a02cff0b47e 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -2084,7 +2084,9 @@ PyObject *objToJSON(PyObject *Py_UNUSED(self), PyObject *args, pyEncoder.datetimeUnit = NPY_FR_us; } else if (strcmp(sdateFormat, "ns") == 0) { pyEncoder.datetimeUnit = NPY_FR_ns; - } else { + } else if (strcmp(sdateFormat, "D") == 0) { + pyEncoder.datetimeUnit = NPY_FR_D; + }else { PyErr_Format(PyExc_ValueError, "Invalid value '%s' for option 'date_unit'", sdateFormat); diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index fb64e089d53a8..7953aaf61cea5 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -1033,12 +1033,13 @@ class Parser: _split_keys: tuple[str, ...] 
_default_orient: str - _STAMP_UNITS = ("s", "ms", "us", "ns") + _STAMP_UNITS = ("s", "ms", "us", "ns", "D") _MIN_STAMPS = { "s": 31536000, "ms": 31536000000, "us": 31536000000000, "ns": 31536000000000000, + "D": 365, } def __init__( @@ -1063,7 +1064,9 @@ def __init__( self.dtype = dtype if date_unit is not None: - date_unit = date_unit.lower() + # avoid lowercasing "D" but ensure retrocompatibility for other units + if date_unit != "D": + date_unit = date_unit.lower() if date_unit not in self._STAMP_UNITS: raise ValueError(f"date_unit must be one of {self._STAMP_UNITS}") self.min_stamp = self._MIN_STAMPS[date_unit] diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 7b473a56aa200..6c34c022f662d 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -835,6 +835,7 @@ def test_convert_dates_infer(self, infer_word): ("20130101 20:43:42.123", "ms"), ("20130101 20:43:42.123456", "us"), ("20130101 20:43:42.123456789", "ns"), + ("20130101", "D"), ], ) def test_date_format_frame(self, date, date_unit, datetime_frame): @@ -865,6 +866,7 @@ def test_date_format_frame_raises(self, datetime_frame): ("20130101 20:43:42.123", "ms"), ("20130101 20:43:42.123456", "us"), ("20130101 20:43:42.123456789", "ns"), + ("20130101", "D"), ], ) def test_date_format_series(self, date, date_unit, datetime_series): @@ -886,7 +888,7 @@ def test_date_format_series_raises(self, datetime_series): ts.to_json(date_format="iso", date_unit="foo") @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) - def test_date_unit(self, unit, datetime_frame): + def test_date_unit_time(self, unit, datetime_frame): df = datetime_frame df["date"] = Timestamp("20130101 20:43:42") dl = df.columns.get_loc("date") @@ -904,6 +906,27 @@ def test_date_unit(self, unit, datetime_frame): result = read_json(json, date_unit=None) tm.assert_frame_equal(result, df) + def test_date_unit_day(self, datetime_frame): + # a different test is implemented 
for unit="D" + # since it needs some handling of the df that unit + # is not autodetected by the read_json method + df = datetime_frame + df["date"] = Timestamp("20130102 20:43:42") + dl = df.columns.get_loc("date") + + df.iloc[1, dl] = Timestamp("19710102 20:43:42") + df.iloc[2, dl] = Timestamp("21460101 20:43:42") + df.iloc[4, dl] = pd.NaT + + json = df.to_json(date_format="epoch", date_unit="D") + + # remove time part since it doesn't get serialized + # so it won't be equal in the deserialized df + df["date"] = pd.to_datetime(df["date"].dt.date) + + result = read_json(json, date_unit="D") + tm.assert_frame_equal(result, df) + def test_weird_nested_json(self): # this used to core dump the parser s = r"""{ diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index a5d7a16f77a72..3cf6d59b34eec 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -394,6 +394,9 @@ def test_datetime_units(self): val = datetime.datetime(2013, 8, 17, 21, 17, 12, 215504) stamp = Timestamp(val).as_unit("ns") + roundtrip = ujson.decode(ujson.encode(val, date_unit="D")) + assert roundtrip == stamp.value // (864 * 10**11) + roundtrip = ujson.decode(ujson.encode(val, date_unit="s")) assert roundtrip == stamp.value // 10**9 From ba87eaa58afc1a844789e7114638a4fb68ee2431 Mon Sep 17 00:00:00 2001 From: Davide Canton Date: Thu, 26 Jan 2023 22:17:41 +0100 Subject: [PATCH 2/4] DOC: updated documentation for date_unit="D" (#16492) --- doc/source/user_guide/io.rst | 13 ++++++++++--- pandas/_libs/src/ujson/python/objToJSON.c | 2 +- pandas/core/generic.py | 4 ++-- pandas/io/json/_json.py | 6 ++++-- 4 files changed, 17 insertions(+), 8 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index dc21b9f35d272..8f8e2cdbcc5bf 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1864,7 +1864,7 @@ with optional parameters: * ``date_format`` : string, type of date conversion, 
'epoch' for timestamp, 'iso' for ISO8601. * ``double_precision`` : The number of decimal places to use when encoding floating point values, default 10. * ``force_ascii`` : force encoded string to be ASCII, default True. -* ``date_unit`` : The time unit to encode to, governs timestamp and ISO8601 precision. One of 's', 'ms', 'us' or 'ns' for seconds, milliseconds, microseconds and nanoseconds respectively. Default 'ms'. +* ``date_unit`` : The time unit to encode to, governs timestamp and ISO8601 precision. One of 'D', 's', 'ms', 'us' or 'ns' for days, seconds, milliseconds, microseconds and nanoseconds respectively. Default 'ms'. * ``default_handler`` : The handler to call if an object cannot otherwise be converted to a suitable format for JSON. Takes a single argument, which is the object to convert, and returns a serializable object. * ``lines`` : If ``records`` orient, then will write each record per line as json. * ``mode`` : string, writer mode when writing to path. 'w' for write, 'a' for append. Default 'w' @@ -1964,6 +1964,13 @@ Writing in ISO date format, with microseconds: json = dfd.to_json(date_format="iso", date_unit="us") json +Writing in ISO date format, with just dates: + +.. ipython:: python + + json = dfd.to_json(date_format="iso", date_unit="D") + json + Epoch timestamps, in seconds: .. ipython:: python @@ -2064,8 +2071,8 @@ is ``None``. To explicitly force ``Series`` parsing, pass ``typ=series`` * ``precise_float`` : boolean, default ``False``. Set to enable usage of higher precision (strtod) function when decoding string to double values. Default (``False``) is to use fast but less precise builtin functionality. * ``date_unit`` : string, the timestamp unit to detect if converting dates. Default None. By default the timestamp precision will be detected, if this is not desired - then pass one of 's', 'ms', 'us' or 'ns' to force timestamp precision to - seconds, milliseconds, microseconds or nanoseconds respectively. 
+ then pass one of 'D', 's', 'ms', 'us' or 'ns' to force timestamp precision to + days, seconds, milliseconds, microseconds or nanoseconds respectively. * ``lines`` : reads file as one json object per line. * ``encoding`` : The encoding to use to decode py3 bytes. * ``chunksize`` : when used in combination with ``lines=True``, return a JsonReader which reads in ``chunksize`` lines per iteration. diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 85a02cff0b47e..2250536ddad91 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -2086,7 +2086,7 @@ PyObject *objToJSON(PyObject *Py_UNUSED(self), PyObject *args, pyEncoder.datetimeUnit = NPY_FR_ns; } else if (strcmp(sdateFormat, "D") == 0) { pyEncoder.datetimeUnit = NPY_FR_D; - }else { + } else { PyErr_Format(PyExc_ValueError, "Invalid value '%s' for option 'date_unit'", sdateFormat); diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 998c57b66509d..cac6621aff7f0 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2349,8 +2349,8 @@ def to_json( Force encoded string to be ASCII. date_unit : str, default 'ms' (milliseconds) The time unit to encode to, governs timestamp and ISO8601 - precision. One of 's', 'ms', 'us', 'ns' for second, millisecond, - microsecond, and nanosecond respectively. + precision. One of 'D', 's', 'ms', 'us', 'ns' for day, + second, millisecond, microsecond, and nanosecond respectively. default_handler : callable, default None Handler to call if object cannot otherwise be converted to a suitable format for JSON. Should receive a single argument which is diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 7953aaf61cea5..9a500efa9d0d8 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -596,8 +596,10 @@ def read_json( date_unit : str, default None The timestamp unit to detect if converting dates. 
The default behaviour is to try and detect the correct precision, but if this is not desired - then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds, - milliseconds, microseconds or nanoseconds respectively. + then pass one of 'D', 's', 'ms', 'us' or 'ns' to force parsing only days, + seconds, milliseconds, microseconds or nanoseconds respectively. + Time units (from 's' to 'ns') are automatically inferred from the input, + while the 'D' unit is not inferred and must be specified explicitly. encoding : str, default is 'utf-8' The encoding to use to decode py3 bytes. From f8d7654e3573194fa75d7926a37086b60e5b2975 Mon Sep 17 00:00:00 2001 From: Davide Canton Date: Sat, 15 Jul 2023 16:03:32 +0200 Subject: [PATCH 3/4] TST: Fixed test after merge of 2.0 (#16492) --- pandas/tests/io/json/test_pandas.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index b00988456e8f9..4c85f3379d204 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -956,25 +956,35 @@ def test_date_unit_time(self, unit, datetime_frame): result = read_json(StringIO(json), date_unit=None) tm.assert_frame_equal(result, df) - def test_date_unit_day(self, datetime_frame): + def test_date_unit_day(self, datetime_frame: DataFrame): # a different test is implemented for unit="D" - # since it needs some handling of the df that unit + # since it needs some handling of the df, because unit # is not autodetected by the read_json method df = datetime_frame - df["date"] = Timestamp("20130102 20:43:42") + df["date"] = Timestamp("20130102 20:43:42").as_unit("ns") dl = df.columns.get_loc("date") df.iloc[1, dl] = Timestamp("19710102 20:43:42") df.iloc[2, dl] = Timestamp("21460101 20:43:42") df.iloc[4, dl] = pd.NaT - json = df.to_json(date_format="epoch", date_unit="D") + jsonStr = df.to_json(date_format="epoch", date_unit="D") # remove time part since 
it doesn't get serialized # so it won't be equal in the deserialized df df["date"] = pd.to_datetime(df["date"].dt.date) - result = read_json(json, date_unit="D") + parsed = json.loads(jsonStr) + jsonDate = Series(parsed["date"].values(), index=df.index) + dfDate = ( + df["date"].map(lambda d: d.timestamp() if not pd.isna(d) else None) + / 60 + / 60 + / 24 + ) + tm.assert_series_equal(jsonDate, dfDate, check_names=False) + + result = read_json(StringIO(jsonStr), date_unit="D") tm.assert_frame_equal(result, df) def test_weird_nested_json(self): From 7c8ae7dc267a257711973ca61fc408330c1a7ad0 Mon Sep 17 00:00:00 2001 From: Davide Canton Date: Sat, 15 Jul 2023 16:03:57 +0200 Subject: [PATCH 4/4] BUG: added missing break in switch-case (#16492) --- pandas/_libs/src/datetime/date_conversions.c | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/_libs/src/datetime/date_conversions.c b/pandas/_libs/src/datetime/date_conversions.c index 7d7d7effae2b0..7184fe43d01d3 100644 --- a/pandas/_libs/src/datetime/date_conversions.c +++ b/pandas/_libs/src/datetime/date_conversions.c @@ -36,6 +36,7 @@ int scaleNanosecToUnit(npy_int64 *value, NPY_DATETIMEUNIT unit) { case NPY_FR_D: // 24 * 60 * 60 * 1000000000LL to convert to days *value /= 86400000000000LL; + break; default: return -1; }