From bfa1dfa130bb27037e885f727ad6154353b3dac3 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 10 Oct 2019 11:48:34 -0700 Subject: [PATCH 01/17] Added test for failing tz roundtrip --- pandas/tests/io/json/test_pandas.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 8e28740c70bad..f7a44a488a7b7 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1137,6 +1137,15 @@ def test_datetime_tz(self): s_naive = Series(tz_naive) assert stz.to_json() == s_naive.to_json() + def test_datetime_tz_iso_maintains_offset(self, orient): + # GH 12997 + tz_range = pd.date_range("20130101", periods=3, tz="US/Eastern") + df = DataFrame(tz_range, columns=['date']) + result = pd.read_json(df.to_json(orient=orient, date_format="iso"), orient=orient) + expected = df.copy() + + assert_json_roundtrip_equal(result, expected, orient) + def test_sparse(self): # GH4377 df.to_json segfaults with non-ndarray blocks df = pd.DataFrame(np.random.randn(10, 4)) From 3281b80ef30e047765e0c8ecfe45c0c92a707e4e Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 10 Oct 2019 12:43:35 -0700 Subject: [PATCH 02/17] Vendored np datetime changes --- pandas/_libs/src/ujson/python/objToJSON.c | 5 +- .../_libs/tslibs/src/datetime/np_datetime.c | 18 + .../_libs/tslibs/src/datetime/np_datetime.h | 1 + .../tslibs/src/datetime/np_datetime_strings.c | 483 ++++++++++++++++-- .../tslibs/src/datetime/np_datetime_strings.h | 19 +- 5 files changed, 466 insertions(+), 60 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 48712dc68829d..88833fed0e099 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -477,7 +477,10 @@ static void *PandasDateTimeStructToJSON(npy_datetimestruct *dts, return NULL; } - if (!make_iso_8601_datetime(dts, GET_TC(tc)->cStr, *_outLen, base)) { + // The 
current make_iso_8601_datetime implementation requires you to provide local + // offsets in minutes + int tzoffset = -300; + if (!make_iso_8601_datetime(dts, GET_TC(tc)->cStr, *_outLen, 1, 0, base, tzoffset, 0)) { PRINTMARK(); *_outLen = strlen(GET_TC(tc)->cStr); return GET_TC(tc)->cStr; diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime.c b/pandas/_libs/tslibs/src/datetime/np_datetime.c index a8a47e2e90f93..443333bf93b58 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime.c @@ -28,6 +28,24 @@ This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt #include #include "np_datetime.h" +char *_datetime_strings[NPY_DATETIME_NUMUNITS] = { + "Y", + "M", + "W", + "", + "D", + "h", + "m", + "s", + "ms", + "us", + "ns", + "ps", + "fs", + "as", + "generic" +}; + #if PY_MAJOR_VERSION >= 3 #define PyInt_AsLong PyLong_AsLong #endif // PyInt_AsLong diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime.h b/pandas/_libs/tslibs/src/datetime/np_datetime.h index 549d38409ca83..f6f11247d391d 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime.h +++ b/pandas/_libs/tslibs/src/datetime/np_datetime.h @@ -48,6 +48,7 @@ void pandas_timedelta_to_timedeltastruct(npy_timedelta val, NPY_DATETIMEUNIT fr, pandas_timedeltastruct *result); +extern char *_datetime_strings[NPY_DATETIME_NUMUNITS]; extern const int days_per_month_table[2][12]; // stuff numpy-derived code needs in header diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c index 54ed6ecff21e2..aba0583ee775d 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c @@ -37,6 +37,169 @@ This file implements string parsing and creation for NumPy datetime. #include "np_datetime.h" #include "np_datetime_strings.h" +/* + * Platform-specific time_t typedef. 
Some platforms use 32 bit, some use 64 bit + * and we just use the default with the exception of mingw, where we must use + * 64 bit because MSVCRT version 9 does not have the (32 bit) localtime() + * symbol, so we need to use the 64 bit version [1]. + * + * [1] http://thread.gmane.org/gmane.comp.gnu.mingw.user/27011 + */ +#if defined(NPY_MINGW_USE_CUSTOM_MSVCR) + typedef __time64_t NPY_TIME_T; +#else + typedef time_t NPY_TIME_T; +#endif + +/* + * Wraps `localtime` functionality for multiple platforms. This + * converts a time value to a time structure in the local timezone. + * If size(NPY_TIME_T) == 4, then years must be between 1970 and 2038. If + * size(NPY_TIME_T) == 8, then years must be later than 1970. If the years are + * not in this range, then get_localtime() will fail on some platforms. + * + * Returns 0 on success, -1 on failure. + * + * Notes: + * 1) If NPY_TIME_T is 32 bit (i.e. sizeof(NPY_TIME_T) == 4), then the + * maximum year it can represent is 2038 (see [1] for more details). Trying + * to use a higher date like 2041 in the 32 bit "ts" variable below will + * typically result in "ts" being a negative number (corresponding roughly + * to a year ~ 1905). If NPY_TIME_T is 64 bit, then there is no such + * problem in practice. + * 2) If the "ts" argument to localtime() is negative, it represents + * years < 1970 both for 32 and 64 bits (for 32 bits the earliest year it can + * represent is 1901, while 64 bits can represent much earlier years). + * 3) On Linux, localtime() works for negative "ts". On Windows and in Wine, + * localtime() as well as the localtime_s() and _localtime64_s() functions + * will fail for any negative "ts" and return a nonzero exit number + * (localtime_s, _localtime64_s) or NULL (localtime). This behavior is the + * same for both 32 and 64 bits. 
+ * + * From this it follows that get_localtime() is only guaranteed to work + * correctly on all platforms for years between 1970 and 2038 for 32bit + * NPY_TIME_T and years higher than 1970 for 64bit NPY_TIME_T. For + * multiplatform code, get_localtime() should never be used outside of this + * range. + * + * [1] https://en.wikipedia.org/wiki/Year_2038_problem + */ +static int +get_localtime(NPY_TIME_T *ts, struct tm *tms) +{ + char *func_name = ""; +#if defined(_WIN32) + #if defined(_MSC_VER) && (_MSC_VER >= 1400) + if (localtime_s(tms, ts) != 0) { + func_name = "localtime_s"; + goto fail; + } + #elif defined(NPY_MINGW_USE_CUSTOM_MSVCR) + if (_localtime64_s(tms, ts) != 0) { + func_name = "_localtime64_s"; + goto fail; + } + #else + struct tm *tms_tmp; + tms_tmp = localtime(ts); + if (tms_tmp == NULL) { + func_name = "localtime"; + goto fail; + } + memcpy(tms, tms_tmp, sizeof(struct tm)); + #endif +#else + if (localtime_r(ts, tms) == NULL) { + func_name = "localtime_r"; + goto fail; + } +#endif + + return 0; + +fail: + PyErr_Format(PyExc_OSError, "Failed to use '%s' to convert " + "to a local time", func_name); + return -1; +} + + +/* + * Converts a datetimestruct in UTC to a datetimestruct in local time, + * also returning the timezone offset applied. This function works for any year + * > 1970 on all platforms and both 32 and 64 bits. If the year < 1970, then it + * will fail on some platforms. + * + * Returns 0 on success, -1 on failure. + */ +static int +convert_datetimestruct_utc_to_local(npy_datetimestruct *out_dts_local, + const npy_datetimestruct *dts_utc, int *out_timezone_offset) +{ + NPY_TIME_T rawtime = 0, localrawtime; + struct tm tm_; + npy_int64 year_correction = 0; + + /* Make a copy of the input 'dts' to modify */ + *out_dts_local = *dts_utc; + + /* + * For 32 bit NPY_TIME_T, the get_localtime() function does not work for + * years later than 2038, see the comments above get_localtime(). 
So if the + * year >= 2038, we instead call get_localtime() for the year 2036 or 2037 + * (depending on the leap year) which must work and at the end we add the + * 'year_correction' back. + */ + if (sizeof(NPY_TIME_T) == 4 && out_dts_local->year >= 2038) { + if (is_leapyear(out_dts_local->year)) { + /* 2036 is a leap year */ + year_correction = out_dts_local->year - 2036; + out_dts_local->year -= year_correction; /* = 2036 */ + } + else { + /* 2037 is not a leap year */ + year_correction = out_dts_local->year - 2037; + out_dts_local->year -= year_correction; /* = 2037 */ + } + } + + /* + * Convert everything in 'dts' to a time_t, to minutes precision. + * This is POSIX time, which skips leap-seconds, but because + * we drop the seconds value from the npy_datetimestruct, everything + * is ok for this operation. + */ + rawtime = (NPY_TIME_T)get_datetimestruct_days(out_dts_local) * 24 * 60 * 60; + rawtime += dts_utc->hour * 60 * 60; + rawtime += dts_utc->min * 60; + + /* localtime converts a 'time_t' into a local 'struct tm' */ + if (get_localtime(&rawtime, &tm_) < 0) { + /* This should only fail if year < 1970 on some platforms. */ + return -1; + } + + /* Copy back all the values except seconds */ + out_dts_local->min = tm_.tm_min; + out_dts_local->hour = tm_.tm_hour; + out_dts_local->day = tm_.tm_mday; + out_dts_local->month = tm_.tm_mon + 1; + out_dts_local->year = tm_.tm_year + 1900; + + /* Extract the timezone offset that was applied */ + rawtime /= 60; + localrawtime = (NPY_TIME_T)get_datetimestruct_days(out_dts_local) * 24 * 60; + localrawtime += out_dts_local->hour * 60; + localrawtime += out_dts_local->min; + + *out_timezone_offset = localrawtime - rawtime; + + /* Reapply the year 2038 year correction */ + out_dts_local->year += year_correction; + + return 0; +} + /* * Parses (almost) standard ISO 8601 date strings. 
The differences are: @@ -590,47 +753,211 @@ int get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base) { } +/* + * Finds the largest unit whose value is nonzero, and for which + * the remainder for the rest of the units is zero. + */ +static NPY_DATETIMEUNIT +lossless_unit_from_datetimestruct(npy_datetimestruct *dts) +{ + if (dts->as % 1000 != 0) { + return NPY_FR_as; + } + else if (dts->as != 0) { + return NPY_FR_fs; + } + else if (dts->ps % 1000 != 0) { + return NPY_FR_ps; + } + else if (dts->ps != 0) { + return NPY_FR_ns; + } + else if (dts->us % 1000 != 0) { + return NPY_FR_us; + } + else if (dts->us != 0) { + return NPY_FR_ms; + } + else if (dts->sec != 0) { + return NPY_FR_s; + } + else if (dts->min != 0) { + return NPY_FR_m; + } + else if (dts->hour != 0) { + return NPY_FR_h; + } + else if (dts->day != 1) { + return NPY_FR_D; + } + else if (dts->month != 1) { + return NPY_FR_M; + } + else { + return NPY_FR_Y; + } +} + + /* * Converts an npy_datetimestruct to an (almost) ISO 8601 - * NULL-terminated string using timezone Z (UTC). If the string fits in - * the space exactly, it leaves out the NULL terminator and returns success. + * NULL-terminated string. If the string fits in the space exactly, + * it leaves out the NULL terminator and returns success. * * The differences from ISO 8601 are the 'NaT' string, and * the number of year digits is >= 4 instead of strictly 4. * + * If 'local' is non-zero, it produces a string in local time with + * a +-#### timezone offset. If 'local' is zero and 'utc' is non-zero, + * produce a string ending with 'Z' to denote UTC. By default, no time + * zone information is attached. + * * 'base' restricts the output to that unit. Set 'base' to * -1 to auto-detect a base after which all the values are zero. * + * 'tzoffset' is used if 'local' is enabled, and 'tzoffset' is + * set to a value other than -1. This is a manual override for + * the local time zone to use, as an offset in minutes. 
+ * + * 'casting' controls whether data loss is allowed by truncating + * the data to a coarser unit. This interacts with 'local', slightly, + * in order to form a date unit string as a local time, the casting + * must be unsafe. + * * Returns 0 on success, -1 on failure (for example if the output * string was too short). */ -int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, - NPY_DATETIMEUNIT base) { +int +make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, npy_intp outlen, + int local, int utc, NPY_DATETIMEUNIT base, int tzoffset, + NPY_CASTING casting) +{ + npy_datetimestruct dts_local; + int timezone_offset = 0; + char *substr = outstr; - int sublen = outlen; - int tmplen; + npy_intp sublen = outlen; + npy_intp tmplen; + + /* Handle NaT, and treat a datetime with generic units as NaT */ + if (dts->year == NPY_DATETIME_NAT || base == NPY_FR_GENERIC) { + if (outlen < 3) { + goto string_too_short; + } + outstr[0] = 'N'; + outstr[1] = 'a'; + outstr[2] = 'T'; + if (outlen > 3) { + outstr[3] = '\0'; + } + + return 0; + } + + /* + * Only do local time within a reasonable year range. The years + * earlier than 1970 are not made local, because the Windows API + * raises an error when they are attempted (see the comments above the + * get_localtime() function). For consistency, this + * restriction is applied to all platforms. + * + * Note that this only affects how the datetime becomes a string. + * The result is still completely unambiguous, it only means + * that datetimes outside this range will not include a time zone + * when they are printed. 
+ */ + if ((dts->year < 1970 || dts->year >= 10000) && tzoffset == -1) { + local = 0; + } + /* Automatically detect a good unit */ + if (base == NPY_FR_ERROR) { + base = lossless_unit_from_datetimestruct(dts); + /* + * If there's a timezone, use at least minutes precision, + * and never split up hours and minutes by default + */ + if ((base < NPY_FR_m && local) || base == NPY_FR_h) { + base = NPY_FR_m; + } + /* Don't split up dates by default */ + else if (base < NPY_FR_D) { + base = NPY_FR_D; + } + } /* * Print weeks with the same precision as days. * * TODO: Could print weeks with YYYY-Www format if the week * epoch is a Monday. */ - if (base == NPY_FR_W) { + else if (base == NPY_FR_W) { base = NPY_FR_D; } -/* YEAR */ -/* - * Can't use PyOS_snprintf, because it always produces a '\0' - * character at the end, and NumPy string types are permitted - * to have data all the way to the end of the buffer. - */ + /* Use the C API to convert from UTC to local time */ + if (local && tzoffset == -1) { + if (convert_datetimestruct_utc_to_local(&dts_local, dts, + &timezone_offset) < 0) { + return -1; + } + + /* Set dts to point to our local time instead of the UTC time */ + dts = &dts_local; + } + /* Use the manually provided tzoffset */ + else if (local) { + /* Make a copy of the npy_datetimestruct we can modify */ + dts_local = *dts; + dts = &dts_local; + + /* Set and apply the required timezone offset */ + timezone_offset = tzoffset; + add_minutes_to_datetimestruct(dts, timezone_offset); + } + + /* + * Now the datetimestruct data is in the final form for + * the string representation, so ensure that the data + * is being cast according to the casting rule. 
+ */ + if (casting != NPY_UNSAFE_CASTING) { + /* Producing a date as a local time is always 'unsafe' */ + if (base <= NPY_FR_D && local) { + PyErr_SetString(PyExc_TypeError, "Cannot create a local " + "timezone-based date string from a NumPy " + "datetime without forcing 'unsafe' casting"); + return -1; + } + /* Only 'unsafe' and 'same_kind' allow data loss */ + else { + NPY_DATETIMEUNIT unitprec; + + unitprec = lossless_unit_from_datetimestruct(dts); + if (casting != NPY_SAME_KIND_CASTING && unitprec > base) { + PyErr_Format(PyExc_TypeError, "Cannot create a " + "string with unit precision '%s' " + "from the NumPy datetime, which has data at " + "unit precision '%s', " + "requires 'unsafe' or 'same_kind' casting", + _datetime_strings[base], + _datetime_strings[unitprec]); + return -1; + } + } + } + + /* YEAR */ + /* + * Can't use PyOS_snprintf, because it always produces a '\0' + * character at the end, and NumPy string types are permitted + * to have data all the way to the end of the buffer. 
+ */ #ifdef _WIN32 tmplen = _snprintf(substr, sublen, "%04" NPY_INT64_FMT, dts->year); #else tmplen = snprintf(substr, sublen, "%04" NPY_INT64_FMT, dts->year); -#endif // _WIN32 +#endif /* If it ran out of space or there isn't space for the NULL terminator */ if (tmplen < 0 || tmplen > sublen) { goto string_too_short; @@ -647,15 +974,15 @@ int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, } /* MONTH */ - if (sublen < 1) { + if (sublen < 1 ) { goto string_too_short; } substr[0] = '-'; - if (sublen < 2) { + if (sublen < 2 ) { goto string_too_short; } substr[1] = (char)((dts->month / 10) + '0'); - if (sublen < 3) { + if (sublen < 3 ) { goto string_too_short; } substr[2] = (char)((dts->month % 10) + '0'); @@ -671,15 +998,15 @@ int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, } /* DAY */ - if (sublen < 1) { + if (sublen < 1 ) { goto string_too_short; } substr[0] = '-'; - if (sublen < 2) { + if (sublen < 2 ) { goto string_too_short; } substr[1] = (char)((dts->day / 10) + '0'); - if (sublen < 3) { + if (sublen < 3 ) { goto string_too_short; } substr[2] = (char)((dts->day % 10) + '0'); @@ -695,15 +1022,15 @@ int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, } /* HOUR */ - if (sublen < 1) { + if (sublen < 1 ) { goto string_too_short; } substr[0] = 'T'; - if (sublen < 2) { + if (sublen < 2 ) { goto string_too_short; } substr[1] = (char)((dts->hour / 10) + '0'); - if (sublen < 3) { + if (sublen < 3 ) { goto string_too_short; } substr[2] = (char)((dts->hour % 10) + '0'); @@ -716,15 +1043,15 @@ int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, } /* MINUTE */ - if (sublen < 1) { + if (sublen < 1 ) { goto string_too_short; } substr[0] = ':'; - if (sublen < 2) { + if (sublen < 2 ) { goto string_too_short; } substr[1] = (char)((dts->min / 10) + '0'); - if (sublen < 3) { + if (sublen < 3 ) { goto string_too_short; } substr[2] = (char)((dts->min % 10) + '0'); @@ -737,15 
+1064,15 @@ int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, } /* SECOND */ - if (sublen < 1) { + if (sublen < 1 ) { goto string_too_short; } substr[0] = ':'; - if (sublen < 2) { + if (sublen < 2 ) { goto string_too_short; } substr[1] = (char)((dts->sec / 10) + '0'); - if (sublen < 3) { + if (sublen < 3 ) { goto string_too_short; } substr[2] = (char)((dts->sec % 10) + '0'); @@ -758,19 +1085,19 @@ int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, } /* MILLISECOND */ - if (sublen < 1) { + if (sublen < 1 ) { goto string_too_short; } substr[0] = '.'; - if (sublen < 2) { + if (sublen < 2 ) { goto string_too_short; } substr[1] = (char)((dts->us / 100000) % 10 + '0'); - if (sublen < 3) { + if (sublen < 3 ) { goto string_too_short; } substr[2] = (char)((dts->us / 10000) % 10 + '0'); - if (sublen < 4) { + if (sublen < 4 ) { goto string_too_short; } substr[3] = (char)((dts->us / 1000) % 10 + '0'); @@ -783,15 +1110,15 @@ int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, } /* MICROSECOND */ - if (sublen < 1) { + if (sublen < 1 ) { goto string_too_short; } substr[0] = (char)((dts->us / 100) % 10 + '0'); - if (sublen < 2) { + if (sublen < 2 ) { goto string_too_short; } substr[1] = (char)((dts->us / 10) % 10 + '0'); - if (sublen < 3) { + if (sublen < 3 ) { goto string_too_short; } substr[2] = (char)(dts->us % 10 + '0'); @@ -804,15 +1131,15 @@ int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, } /* NANOSECOND */ - if (sublen < 1) { + if (sublen < 1 ) { goto string_too_short; } substr[0] = (char)((dts->ps / 100000) % 10 + '0'); - if (sublen < 2) { + if (sublen < 2 ) { goto string_too_short; } substr[1] = (char)((dts->ps / 10000) % 10 + '0'); - if (sublen < 3) { + if (sublen < 3 ) { goto string_too_short; } substr[2] = (char)((dts->ps / 1000) % 10 + '0'); @@ -825,15 +1152,15 @@ int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, } /* PICOSECOND */ 
- if (sublen < 1) { + if (sublen < 1 ) { goto string_too_short; } substr[0] = (char)((dts->ps / 100) % 10 + '0'); - if (sublen < 2) { + if (sublen < 2 ) { goto string_too_short; } substr[1] = (char)((dts->ps / 10) % 10 + '0'); - if (sublen < 3) { + if (sublen < 3 ) { goto string_too_short; } substr[2] = (char)(dts->ps % 10 + '0'); @@ -846,15 +1173,15 @@ int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, } /* FEMTOSECOND */ - if (sublen < 1) { + if (sublen < 1 ) { goto string_too_short; } substr[0] = (char)((dts->as / 100000) % 10 + '0'); - if (sublen < 2) { + if (sublen < 2 ) { goto string_too_short; } substr[1] = (char)((dts->as / 10000) % 10 + '0'); - if (sublen < 3) { + if (sublen < 3 ) { goto string_too_short; } substr[2] = (char)((dts->as / 1000) % 10 + '0'); @@ -867,15 +1194,15 @@ int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, } /* ATTOSECOND */ - if (sublen < 1) { + if (sublen < 1 ) { goto string_too_short; } substr[0] = (char)((dts->as / 100) % 10 + '0'); - if (sublen < 2) { + if (sublen < 2 ) { goto string_too_short; } substr[1] = (char)((dts->as / 10) % 10 + '0'); - if (sublen < 3) { + if (sublen < 3 ) { goto string_too_short; } substr[2] = (char)(dts->as % 10 + '0'); @@ -883,13 +1210,57 @@ int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, sublen -= 3; add_time_zone: + if (local) { + /* Add the +/- sign */ + if (sublen < 1) { + goto string_too_short; + } + if (timezone_offset < 0) { + substr[0] = '-'; + timezone_offset = -timezone_offset; + } + else { + substr[0] = '+'; + } + substr += 1; + sublen -= 1; + + /* Add the timezone offset */ + if (sublen < 1 ) { + goto string_too_short; + } + substr[0] = (char)((timezone_offset / (10*60)) % 10 + '0'); + if (sublen < 2 ) { + goto string_too_short; + } + substr[1] = (char)((timezone_offset / 60) % 10 + '0'); + if (sublen < 3 ) { + goto string_too_short; + } + + // This is a modification to the vendored code to add a : separator 
+ substr[2] = ':'; + if (sublen < 4 ) { + goto string_too_short; + } + substr[3] = (char)(((timezone_offset % 60) / 10) % 10 + '0'); + if (sublen < 5 ) { + goto string_too_short; + } + substr[4] = (char)((timezone_offset % 60) % 10 + '0'); + substr += 5; + sublen -= 5; + // End of modifications! + } /* UTC "Zulu" time */ - if (sublen < 1) { - goto string_too_short; + else if (utc) { + if (sublen < 1) { + goto string_too_short; + } + substr[0] = 'Z'; + substr += 1; + sublen -= 1; } - substr[0] = 'Z'; - substr += 1; - sublen -= 1; /* Add a NULL terminator, and return */ if (sublen > 0) { @@ -900,8 +1271,8 @@ int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, string_too_short: PyErr_Format(PyExc_RuntimeError, - "The string provided for NumPy ISO datetime formatting " - "was too short, with length %d", - outlen); + "The string provided for NumPy ISO datetime formatting " + "was too short, with length %"NPY_INTP_FMT, + outlen); return -1; } diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h index 880c34ea77638..98368ce020000 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h @@ -68,15 +68,28 @@ get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base); /* * Converts an npy_datetimestruct to an (almost) ISO 8601 - * NULL-terminated string using timezone Z (UTC). + * NULL-terminated string. + * + * If 'local' is non-zero, it produces a string in local time with + * a +-#### timezone offset, otherwise it uses timezone Z (UTC). * * 'base' restricts the output to that unit. Set 'base' to * -1 to auto-detect a base after which all the values are zero. * + * 'tzoffset' is used if 'local' is enabled, and 'tzoffset' is + * set to a value other than -1. This is a manual override for + * the local time zone to use, as an offset in minutes. 
+ * + * 'casting' controls whether data loss is allowed by truncating + * the data to a coarser unit. This interacts with 'local', slightly, + * in order to form a date unit string as a local time, the casting + * must be unsafe. + * * Returns 0 on success, -1 on failure (for example if the output * string was too short). */ int -make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, - NPY_DATETIMEUNIT base); +make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, npy_intp outlen, + int local, int utc, NPY_DATETIMEUNIT base, int tzoffset, + NPY_CASTING casting); #endif // PANDAS__LIBS_TSLIBS_SRC_DATETIME_NP_DATETIME_STRINGS_H_ From 6bb5640386504077a4ca10fdc655ce3c8170747f Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 10 Oct 2019 13:39:10 -0700 Subject: [PATCH 03/17] Working tz info --- pandas/_libs/src/ujson/python/objToJSON.c | 27 ++++++++++++------- .../_libs/tslibs/src/datetime/np_datetime.c | 2 +- .../_libs/tslibs/src/datetime/np_datetime.h | 2 +- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 88833fed0e099..8d774c25e68e2 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -464,7 +464,7 @@ static void *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, static void *PandasDateTimeStructToJSON(npy_datetimestruct *dts, JSONTypeContext *tc, void *outValue, - size_t *_outLen) { + size_t *_outLen, int offset_in_min) { NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; if (((PyObjectEncoder *)tc->encoder)->datetimeIso) { @@ -477,10 +477,7 @@ static void *PandasDateTimeStructToJSON(npy_datetimestruct *dts, return NULL; } - // The current make_iso_8601_datetime implementation requires you to provide local - // offsets in minutes - int tzoffset = -300; - if (!make_iso_8601_datetime(dts, GET_TC(tc)->cStr, *_outLen, 1, 0, base, tzoffset, 0)) { + if 
(!make_iso_8601_datetime(dts, GET_TC(tc)->cStr, *_outLen, 1, 0, base, offset_in_min, 0)) { PRINTMARK(); *_outLen = strlen(GET_TC(tc)->cStr); return GET_TC(tc)->cStr; @@ -508,19 +505,29 @@ static void *NpyDateTimeScalarToJSON(JSOBJ _obj, JSONTypeContext *tc, pandas_datetime_to_datetimestruct(obj->obval, (NPY_DATETIMEUNIT)obj->obmeta.base, &dts); - return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen); + return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen, 0); } static void *PyDateTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) { npy_datetimestruct dts; - PyDateTime_Date *obj = (PyDateTime_Date *)_obj; + PyDateTime_DateTime *obj = (PyDateTime_DateTime *)_obj; PRINTMARK(); if (!convert_pydatetime_to_datetimestruct(obj, &dts)) { PRINTMARK(); - return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen); + + // tz awareness gets lost when converting to pydatetime_datetime, so + // send separately to serialization function + PyObject *utcoffset = PyObject_CallMethod(_obj, "utcoffset", NULL); + PyObject *tot_seconds = PyObject_CallMethod(utcoffset, "total_seconds", NULL); + Py_DECREF(utcoffset); + + long offset_in_min = PyLong_AsLong(tot_seconds) / 60; + Py_DECREF(tot_seconds); + + return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen, offset_in_min); } else { if (!PyErr_Occurred()) { PyErr_SetString(PyExc_ValueError, @@ -538,7 +545,9 @@ static void *NpyDatetime64ToJSON(JSOBJ _obj, JSONTypeContext *tc, pandas_datetime_to_datetimestruct((npy_datetime)GET_TC(tc)->longValue, NPY_FR_ns, &dts); - return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen); + + // TODO: should the offset here be 0? 
+ return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen, 0); } static void *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, void *outValue, diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime.c b/pandas/_libs/tslibs/src/datetime/np_datetime.c index 443333bf93b58..c9af316b9a0dc 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime.c @@ -339,7 +339,7 @@ int cmp_npy_datetimestruct(const npy_datetimestruct *a, * Returns -1 on error, 0 on success, and 1 (with no error set) * if obj doesn't have the needed date or datetime attributes. */ -int convert_pydatetime_to_datetimestruct(PyDateTime_Date *dtobj, +int convert_pydatetime_to_datetimestruct(PyDateTime_DateTime *dtobj, npy_datetimestruct *out) { // Assumes that obj is a valid datetime object PyObject *tmp; diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime.h b/pandas/_libs/tslibs/src/datetime/np_datetime.h index f6f11247d391d..32543b80f219f 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime.h +++ b/pandas/_libs/tslibs/src/datetime/np_datetime.h @@ -35,7 +35,7 @@ extern const npy_datetimestruct _NS_MAX_DTS; // stuff pandas needs // ---------------------------------------------------------------------------- -int convert_pydatetime_to_datetimestruct(PyDateTime_Date *dtobj, +int convert_pydatetime_to_datetimestruct(PyDateTime_DateTime *dtobj, npy_datetimestruct *out); npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT base, From ab4688b286bee2ec30c8ddbc2d35c0cd3061c24a Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 10 Oct 2019 13:40:03 -0700 Subject: [PATCH 04/17] Changed test expectation --- pandas/tests/io/json/test_pandas.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index f7a44a488a7b7..31887b71a094f 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1141,10 +1141,9 @@ def 
test_datetime_tz_iso_maintains_offset(self, orient): # GH 12997 tz_range = pd.date_range("20130101", periods=3, tz="US/Eastern") df = DataFrame(tz_range, columns=['date']) - result = pd.read_json(df.to_json(orient=orient, date_format="iso"), orient=orient) - expected = df.copy() + result = df.to_json(orient=orient, date_format="iso") - assert_json_roundtrip_equal(result, expected, orient) + assert "2013-01-01T00:00:00.000-05:00" in result def test_sparse(self): # GH4377 df.to_json segfaults with non-ndarray blocks From 7caffc0188242f1932e44b6f0ca2068ed322fe61 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 10 Oct 2019 14:13:40 -0700 Subject: [PATCH 05/17] Handled utcoffset being None --- pandas/_libs/src/ujson/python/objToJSON.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 8d774c25e68e2..e23e172c891e1 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -520,12 +520,15 @@ static void *PyDateTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, void *outValue, // tz awareness gets lost when converting to pydatetime_datetime, so // send separately to serialization function + long offset_in_min = 0; PyObject *utcoffset = PyObject_CallMethod(_obj, "utcoffset", NULL); - PyObject *tot_seconds = PyObject_CallMethod(utcoffset, "total_seconds", NULL); + if (utcoffset != Py_None) + { + PyObject *tot_seconds = PyObject_CallMethod(utcoffset, "total_seconds", NULL); + offset_in_min = PyLong_AsLong(tot_seconds) / 60; + Py_DECREF(tot_seconds); + } Py_DECREF(utcoffset); - - long offset_in_min = PyLong_AsLong(tot_seconds) / 60; - Py_DECREF(tot_seconds); return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen, offset_in_min); } else { From 412478f8fa3cfeed018d034116d2f5fd3227de47 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 10 Oct 2019 14:28:39 -0700 Subject: [PATCH 06/17] fixed wrong tests --- 
pandas/tests/io/json/test_pandas.py | 36 ++++++++++++----------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 31887b71a094f..507bd2b09f418 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1162,38 +1162,32 @@ def test_sparse(self): assert expected == ss.to_json() @pytest.mark.parametrize( - "ts", + "ts,expected", [ - Timestamp("2013-01-10 05:00:00Z"), - Timestamp("2013-01-10 00:00:00", tz="US/Eastern"), - Timestamp("2013-01-10 00:00:00-0500"), + (Timestamp("2013-01-10 05:00:00Z"), + '"2013-01-10T05:00:00.000+00:00"'), + (Timestamp("2013-01-10 00:00:00", tz="US/Eastern"), + '"2013-01-10T00:00:00.000-05:00"'), + (Timestamp("2013-01-10 00:00:00-0500"), + '"2013-01-10T00:00:00.000-05:00"') ], ) - def test_tz_is_utc(self, ts): + def test_tz_utc_offsets(self, ts, expected): from pandas.io.json import dumps - exp = '"2013-01-10T05:00:00.000Z"' - - assert dumps(ts, iso_dates=True) == exp + assert dumps(ts, iso_dates=True) == expected dt = ts.to_pydatetime() - assert dumps(dt, iso_dates=True) == exp + assert dumps(dt, iso_dates=True) == expected - @pytest.mark.parametrize( - "tz_range", - [ - pd.date_range("2013-01-01 05:00:00Z", periods=2), - pd.date_range("2013-01-01 00:00:00", periods=2, tz="US/Eastern"), - pd.date_range("2013-01-01 00:00:00-0500", periods=2), - ], - ) - def test_tz_range_is_utc(self, tz_range): + def test_tz_range_is_utc(self): from pandas.io.json import dumps - exp = '["2013-01-01T05:00:00.000Z","2013-01-02T05:00:00.000Z"]' + tz_range = pd.date_range("2013-01-01 05:00:00Z", periods=2) + exp = '["2013-01-01T05:00:00.000+00:00","2013-01-02T05:00:00.000+00:00"]' dfexp = ( '{"DT":{' - '"0":"2013-01-01T05:00:00.000Z",' - '"1":"2013-01-02T05:00:00.000Z"}}' + '"0":"2013-01-01T05:00:00.000+00:00",' + '"1":"2013-01-02T05:00:00.000+00:00"}}' ) assert dumps(tz_range, iso_dates=True) == exp From 
647aff457a2e0071d3bcf23e11163f9a0168147e Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 10 Oct 2019 14:29:45 -0700 Subject: [PATCH 07/17] Moved test location --- pandas/tests/io/json/test_pandas.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 507bd2b09f418..e73bc98cc1a59 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1137,14 +1137,6 @@ def test_datetime_tz(self): s_naive = Series(tz_naive) assert stz.to_json() == s_naive.to_json() - def test_datetime_tz_iso_maintains_offset(self, orient): - # GH 12997 - tz_range = pd.date_range("20130101", periods=3, tz="US/Eastern") - df = DataFrame(tz_range, columns=['date']) - result = df.to_json(orient=orient, date_format="iso") - - assert "2013-01-01T00:00:00.000-05:00" in result - def test_sparse(self): # GH4377 df.to_json segfaults with non-ndarray blocks df = pd.DataFrame(np.random.randn(10, 4)) @@ -1197,6 +1189,14 @@ def test_tz_range_is_utc(self): result = dumps(df, iso_dates=True) assert result == dfexp + def test_datetime_tz_iso_maintains_offset(self, orient): + # GH 12997 + tz_range = pd.date_range("20130101", periods=3, tz="US/Eastern") + df = DataFrame(tz_range, columns=['date']) + result = df.to_json(orient=orient, date_format="iso") + + assert "2013-01-01T00:00:00.000-05:00" in result + def test_read_inline_jsonl(self): # GH9180 result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True) From a75fa76febbc6f862367849e4f97a46e2cfce3c3 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 10 Oct 2019 14:30:50 -0700 Subject: [PATCH 08/17] black --- pandas/tests/io/json/test_pandas.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index e73bc98cc1a59..13621a9ce2480 100644 --- a/pandas/tests/io/json/test_pandas.py +++ 
b/pandas/tests/io/json/test_pandas.py @@ -1156,12 +1156,12 @@ def test_sparse(self): @pytest.mark.parametrize( "ts,expected", [ - (Timestamp("2013-01-10 05:00:00Z"), - '"2013-01-10T05:00:00.000+00:00"'), - (Timestamp("2013-01-10 00:00:00", tz="US/Eastern"), - '"2013-01-10T00:00:00.000-05:00"'), - (Timestamp("2013-01-10 00:00:00-0500"), - '"2013-01-10T00:00:00.000-05:00"') + (Timestamp("2013-01-10 05:00:00Z"), '"2013-01-10T05:00:00.000+00:00"'), + ( + Timestamp("2013-01-10 00:00:00", tz="US/Eastern"), + '"2013-01-10T00:00:00.000-05:00"', + ), + (Timestamp("2013-01-10 00:00:00-0500"), '"2013-01-10T00:00:00.000-05:00"'), ], ) def test_tz_utc_offsets(self, ts, expected): @@ -1192,10 +1192,10 @@ def test_tz_range_is_utc(self): def test_datetime_tz_iso_maintains_offset(self, orient): # GH 12997 tz_range = pd.date_range("20130101", periods=3, tz="US/Eastern") - df = DataFrame(tz_range, columns=['date']) + df = DataFrame(tz_range, columns=["date"]) result = df.to_json(orient=orient, date_format="iso") - assert "2013-01-01T00:00:00.000-05:00" in result + assert "2013-01-01T00:00:00.000-05:00" in result def test_read_inline_jsonl(self): # GH9180 From d29497cc9ac58cdce5384c60d8440333ba85b2bd Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 10 Oct 2019 14:52:44 -0700 Subject: [PATCH 09/17] Some docstrings --- pandas/_libs/src/ujson/python/objToJSON.c | 45 ++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index e23e172c891e1..d7779d7ec2d6f 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -462,6 +462,27 @@ static void *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, return PyBytes_AS_STRING(newObj); } +/* +Generic function to serialize date time structs to the appropriate JSON format. 
+ +Parameters +---------- +npy_datetimestruct *dts : Pointer to a struct holding datetime information (year, month, day, etc...) +JSONTypeContext *tc : Pointer to the context for serialization +void *outValue : Pointer to a JSON serializable value +size_t *_outLen : For C-string output, the length of the string that needs to be accounted for +int offset_in_min : Number of minutes the npy_datetimestruct is offset from UTC + +Returns +------- +TODO : This returns a C String for ISO dates while also modifying the cStr for the type context. + That seems buggy and/or unnecessary? + +Notes +----- +In an ideal world we wouldn't have to handle offset_in_min separate from npy_datetimestruct. +Unfortunately npy_datetimestruct does not hold this info, so we pass it alongside the struct. +*/ static void *PandasDateTimeStructToJSON(npy_datetimestruct *dts, JSONTypeContext *tc, void *outValue, size_t *_outLen, int offset_in_min) { @@ -508,6 +529,27 @@ static void *NpyDateTimeScalarToJSON(JSOBJ _obj, JSONTypeContext *tc, return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen, 0); } +/* +Top level method for returning the conversion routine for serializing a datetimestruct to JSON. + +Parameters +---------- +JSOBJ _obj : In all actuality, this is a PyObject* passed from the Object_ type context; should be a datetime +JSONTypeContext *tc : Pointer to the Type Context at this point in serialization +void *outValue : Pointer to the serializable object; in this scope, can be either an integer or C-string, + depending on whether or not we are serializing dates to Unix epoch or ISO format +size_t *_outLen : Pointer to the C-string length of the serializable object; should be modified locally + +Returns +------- +Function pointer to appropriate serialization routine. + +Notes +----- +For iso_date formats, this passes a npy_datetimestruct to the appropriate conversion function. 
+Unfortunately the npy_datetimestuct does not have timezone awareness, so the offset from UTC in +minutes is passed instead. +*/ static void *PyDateTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) { npy_datetimestruct dts; @@ -549,7 +591,8 @@ static void *NpyDatetime64ToJSON(JSOBJ _obj, JSONTypeContext *tc, pandas_datetime_to_datetimestruct((npy_datetime)GET_TC(tc)->longValue, NPY_FR_ns, &dts); - // TODO: should the offset here be 0? + // Because this function is for numpy datetimes which by nature are not tz-aware + // we can pass the offset_in_min as 0 return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen, 0); } From 122a8fb036f9768a58912d46ad519b9c656d3060 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 10 Oct 2019 14:56:42 -0700 Subject: [PATCH 10/17] clang-format run --- pandas/_libs/src/ujson/python/objToJSON.c | 58 +++++++++++++---------- 1 file changed, 34 insertions(+), 24 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index d7779d7ec2d6f..35b339bbe7f3d 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -436,7 +436,7 @@ static void *PyFloatToDOUBLE(JSOBJ _obj, JSONTypeContext *tc, void *outValue, } static void *PyBytesToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, - size_t *_outLen) { + size_t *_outLen) { PyObject *obj = (PyObject *)_obj; *_outLen = PyBytes_GET_SIZE(obj); return PyBytes_AS_STRING(obj); @@ -467,21 +467,24 @@ Generic function to serialize date time structs to the appropriate JSON format. Parameters ---------- -npy_datetimestruct *dts : Pointer to a struct holding datetime information (year, month, day, etc...) +npy_datetimestruct *dts : Pointer to a struct holding datetime information + (year, month, day, etc...) 
JSONTypeContext *tc : Pointer to the context for serialization -void *outValue : Pointer to a JSON serializable value -size_t *_outLen : For C-string output, the length of the string that needs to be accounted for +void *outValue : Pointer to a JSON serializable value size_t +*_outLen : For C-string output, the length of the string that needs to be + accounted for. int offset_in_min : Number of minutes the npy_datetimestruct is offset from UTC Returns ------- -TODO : This returns a C String for ISO dates while also modifying the cStr for the type context. - That seems buggy and/or unnecessary? +TODO : This returns a C String for ISO dates while also modifying the cStr for + the type context. That seems buggy and/or unnecessary? Notes ----- -In an ideal world we wouldn't have to handle offset_in_min separate from npy_datetimestruct. -Unfortunately npy_datetimestruct does not hold this info, so we pass it alongside the struct. +In an ideal world we wouldn't have to handle offset_in_min separate from +npy_datetimestruct. Unfortunately npy_datetimestruct does not hold this info, so +we pass it alongside the struct. */ static void *PandasDateTimeStructToJSON(npy_datetimestruct *dts, JSONTypeContext *tc, void *outValue, @@ -498,7 +501,8 @@ static void *PandasDateTimeStructToJSON(npy_datetimestruct *dts, return NULL; } - if (!make_iso_8601_datetime(dts, GET_TC(tc)->cStr, *_outLen, 1, 0, base, offset_in_min, 0)) { + if (!make_iso_8601_datetime(dts, GET_TC(tc)->cStr, *_outLen, 1, 0, base, + offset_in_min, 0)) { PRINTMARK(); *_outLen = strlen(GET_TC(tc)->cStr); return GET_TC(tc)->cStr; @@ -530,15 +534,20 @@ static void *NpyDateTimeScalarToJSON(JSOBJ _obj, JSONTypeContext *tc, } /* -Top level method for returning the conversion routine for serializing a datetimestruct to JSON. +Top level method for returning the conversion routine for serializing a +datetimestruct to JSON. 
Parameters ---------- -JSOBJ _obj : In all actuality, this is a PyObject* passed from the Object_ type context; should be a datetime +JSOBJ _obj : In all actuality, this is a PyObject* passed from the Object_ type + context; should be a datetime JSONTypeContext *tc : Pointer to the Type Context at this point in serialization -void *outValue : Pointer to the serializable object; in this scope, can be either an integer or C-string, - depending on whether or not we are serializing dates to Unix epoch or ISO format -size_t *_outLen : Pointer to the C-string length of the serializable object; should be modified locally +void *outValue : Pointer to the serializable object; in this scope, can be + either an integer or C-string, + depending on whether or not we are serializing dates to Unix epoch or ISO + format +size_t *_outLen : Pointer to the C-string length of the serializable object. + Should be modified within function body. Returns ------- @@ -546,9 +555,9 @@ Function pointer to appropriate serialization routine. Notes ----- -For iso_date formats, this passes a npy_datetimestruct to the appropriate conversion function. -Unfortunately the npy_datetimestuct does not have timezone awareness, so the offset from UTC in -minutes is passed instead. +For iso_date formats, this passes a npy_datetimestruct to the appropriate +conversion function. Unfortunately the npy_datetimestruct does not have timezone +awareness, so the offset from UTC in minutes is passed instead.
*/ static void *PyDateTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) { @@ -564,15 +573,16 @@ static void *PyDateTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, void *outValue, // send separately to serialization function long offset_in_min = 0; PyObject *utcoffset = PyObject_CallMethod(_obj, "utcoffset", NULL); - if (utcoffset != Py_None) - { - PyObject *tot_seconds = PyObject_CallMethod(utcoffset, "total_seconds", NULL); + if (utcoffset != Py_None) { + PyObject *tot_seconds = + PyObject_CallMethod(utcoffset, "total_seconds", NULL); offset_in_min = PyLong_AsLong(tot_seconds) / 60; Py_DECREF(tot_seconds); - } + } Py_DECREF(utcoffset); - return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen, offset_in_min); + return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen, + offset_in_min); } else { if (!PyErr_Occurred()) { PyErr_SetString(PyExc_ValueError, @@ -591,8 +601,8 @@ static void *NpyDatetime64ToJSON(JSOBJ _obj, JSONTypeContext *tc, pandas_datetime_to_datetimestruct((npy_datetime)GET_TC(tc)->longValue, NPY_FR_ns, &dts); - // Because this function is for numpy datetimes which by nature are not tz-aware - // we can pass the offset_in_min as 0 + // Because this function is for numpy datetimes which by nature are not + // tz-aware we can pass the offset_in_min as 0 return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen, 0); } From 62ac65e0f16dacc90c2bd26a51eff04395b994b6 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 10 Oct 2019 14:58:33 -0700 Subject: [PATCH 11/17] Whatsnew --- doc/source/whatsnew/v1.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index cde2a4279cf27..47dc65b2afc6b 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -314,6 +314,7 @@ I/O - Bug in :func:`read_hdf` closing stores that it didn't open when Exceptions are raised (:issue:`28699`) - Bug in :meth:`DataFrame.read_json` where using 
``orient="index"`` would not maintain the order (:issue:`28557`) - Bug in :meth:`DataFrame.to_html` where the length of the ``formatters`` argument was not verified (:issue:`28469`) +- Bug in :meth:`DataFrame.to_json` where timezone-aware dates were converted to UTC (:issue:`12997`) Plotting ^^^^^^^^ From 6bd2aaa84d7b7827f5bfdb1806b67bc4f2a0d4d4 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 10 Oct 2019 15:11:13 -0700 Subject: [PATCH 12/17] Error handling and formatting fixup --- pandas/_libs/src/ujson/python/objToJSON.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 35b339bbe7f3d..303abb7e640c6 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -569,16 +569,25 @@ static void *PyDateTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, void *outValue, if (!convert_pydatetime_to_datetimestruct(obj, &dts)) { PRINTMARK(); - // tz awareness gets lost when converting to pydatetime_datetime, so - // send separately to serialization function long offset_in_min = 0; PyObject *utcoffset = PyObject_CallMethod(_obj, "utcoffset", NULL); - if (utcoffset != Py_None) { + + if (utcoffset == NULL) + return PyErr_NoMemory(); + + else if (utcoffset != Py_None) { PyObject *tot_seconds = PyObject_CallMethod(utcoffset, "total_seconds", NULL); + + if (tot_seconds == NULL) { + Py_DECREF(utcoffset); + return PyErr_NoMemory(); + } + offset_in_min = PyLong_AsLong(tot_seconds) / 60; Py_DECREF(tot_seconds); } + Py_DECREF(utcoffset); return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen, From cbf45bda60e0d96769e864adf0a8bb254121f3dc Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 10 Oct 2019 17:24:19 -0700 Subject: [PATCH 13/17] Test fixups --- .../tests/io/json/test_json_table_schema.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git 
a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index 569e299860614..32d045f82010e 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -285,12 +285,12 @@ def test_to_json(self): ("idx", 0), ("A", 1), ("B", "a"), - ("C", "2016-01-01T00:00:00.000Z"), + ("C", "2016-01-01T00:00:00.000+00:00"), ("D", "P0DT1H0M0S"), ("E", "a"), ("F", "a"), ("G", 1.0), - ("H", "2016-01-01T06:00:00.000Z"), + ("H", "2016-01-01T00:00:00.000-06:00"), ] ), OrderedDict( @@ -298,12 +298,12 @@ def test_to_json(self): ("idx", 1), ("A", 2), ("B", "b"), - ("C", "2016-01-02T00:00:00.000Z"), + ("C", "2016-01-02T00:00:00.000+00:00"), ("D", "P0DT1H1M0S"), ("E", "b"), ("F", "b"), ("G", 2.0), - ("H", "2016-01-02T06:00:00.000Z"), + ("H", "2016-01-02T00:00:00.000-06:00"), ] ), OrderedDict( @@ -311,12 +311,12 @@ def test_to_json(self): ("idx", 2), ("A", 3), ("B", "c"), - ("C", "2016-01-03T00:00:00.000Z"), + ("C", "2016-01-03T00:00:00.000+00:00"), ("D", "P0DT1H2M0S"), ("E", "c"), ("F", "c"), ("G", 3.0), - ("H", "2016-01-03T06:00:00.000Z"), + ("H", "2016-01-03T00:00:00.000-06:00"), ] ), OrderedDict( @@ -324,12 +324,12 @@ def test_to_json(self): ("idx", 3), ("A", 4), ("B", "c"), - ("C", "2016-01-04T00:00:00.000Z"), + ("C", "2016-01-04T00:00:00.000+00:00"), ("D", "P0DT1H3M0S"), ("E", "c"), ("F", "c"), ("G", 4.0), - ("H", "2016-01-04T06:00:00.000Z"), + ("H", "2016-01-04T00:00:00.000-06:00"), ] ), ] @@ -381,8 +381,8 @@ def test_to_json_period_index(self): schema = {"fields": fields, "primaryKey": ["index"]} data = [ - OrderedDict([("index", "2015-11-01T00:00:00.000Z"), ("values", 1)]), - OrderedDict([("index", "2016-02-01T00:00:00.000Z"), ("values", 1)]), + OrderedDict([("index", "2015-11-01T00:00:00.000+00:00"), ("values", 1)]), + OrderedDict([("index", "2016-02-01T00:00:00.000+00:00"), ("values", 1)]), ] expected = OrderedDict([("schema", schema), ("data", data)]) @@ -612,7 +612,7 @@ def 
test_timestamp_in_columns(self): ) result = df.to_json(orient="table") js = json.loads(result) - assert js["schema"]["fields"][1]["name"] == "2016-01-01T00:00:00.000Z" + assert js["schema"]["fields"][1]["name"] == "2016-01-01T00:00:00.000+00:00" # TODO - below expectation is not correct; see GH 28256 assert js["schema"]["fields"][2]["name"] == 10000 From 448f690986aa8b040ea6ed3e54e5dd41d751e3f5 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 10 Oct 2019 17:49:37 -0700 Subject: [PATCH 14/17] Fixed MemErr clears --- pandas/_libs/src/ujson/python/objToJSON.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 303abb7e640c6..e607f36f6216a 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -573,15 +573,15 @@ static void *PyDateTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, void *outValue, PyObject *utcoffset = PyObject_CallMethod(_obj, "utcoffset", NULL); if (utcoffset == NULL) - return PyErr_NoMemory(); - + return NULL; + } else if (utcoffset != Py_None) { PyObject *tot_seconds = PyObject_CallMethod(utcoffset, "total_seconds", NULL); if (tot_seconds == NULL) { Py_DECREF(utcoffset); - return PyErr_NoMemory(); + return NULL; } offset_in_min = PyLong_AsLong(tot_seconds) / 60; From 8153eb170ac95f434181a005610463b32716bd04 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 10 Oct 2019 20:13:04 -0700 Subject: [PATCH 15/17] Fixed segfault --- pandas/_libs/src/ujson/python/objToJSON.c | 32 ++++++++++++++--------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index e607f36f6216a..7bd20c32bd741 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -572,23 +572,29 @@ static void *PyDateTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, void *outValue, long offset_in_min 
= 0; PyObject *utcoffset = PyObject_CallMethod(_obj, "utcoffset", NULL); - if (utcoffset == NULL) - return NULL; - } - else if (utcoffset != Py_None) { - PyObject *tot_seconds = - PyObject_CallMethod(utcoffset, "total_seconds", NULL); - - if (tot_seconds == NULL) { - Py_DECREF(utcoffset); + if (utcoffset == NULL) { + if (PyErr_ExceptionMatches(PyExc_AttributeError)) { + // 'datetime.date' object has no attribute 'utcoffset' + PyErr_Clear(); + } else { + // Propagate any other errors return NULL; } + } else { + if (utcoffset != Py_None) { + PyObject *tot_seconds = + PyObject_CallMethod(utcoffset, "total_seconds", NULL); - offset_in_min = PyLong_AsLong(tot_seconds) / 60; - Py_DECREF(tot_seconds); - } + if (tot_seconds == NULL) { + Py_DECREF(utcoffset); + return NULL; + } - Py_DECREF(utcoffset); + offset_in_min = PyLong_AsLong(tot_seconds) / 60; + Py_DECREF(tot_seconds); + } + Py_DECREF(utcoffset); + } return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen, offset_in_min); From 7b5f658f07e27b3ca0a5e5fcc17844e32f4f7836 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 10 Oct 2019 20:23:02 -0700 Subject: [PATCH 16/17] Removed modification to vendored code for colon --- .../tslibs/src/datetime/np_datetime_strings.c | 17 +++++--------- .../tests/io/json/test_json_table_schema.py | 22 +++++++++---------- pandas/tests/io/json/test_pandas.py | 14 ++++++------ 3 files changed, 23 insertions(+), 30 deletions(-) diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c index aba0583ee775d..417e3f6a77217 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c @@ -1237,20 +1237,13 @@ make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, npy_intp outlen, if (sublen < 3 ) { goto string_too_short; } - - // This is a modification to the vendored code to add a : separator - substr[2] = ':'; - if (sublen < 4 ) { - goto
string_too_short; - } - substr[3] = (char)(((timezone_offset % 60) / 10) % 10 + '0'); - if (sublen < 5 ) { + substr[2] = (char)(((timezone_offset % 60) / 10) % 10 + '0'); + if (sublen < 3 ) { goto string_too_short; } - substr[4] = (char)((timezone_offset % 60) % 10 + '0'); - substr += 5; - sublen -= 5; - // End of modifications! + substr[3] = (char)((timezone_offset % 60) % 10 + '0'); + substr += 4; + sublen -= 4; } /* UTC "Zulu" time */ else if (utc) { diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index 32d045f82010e..0dc3f172433df 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -285,12 +285,12 @@ def test_to_json(self): ("idx", 0), ("A", 1), ("B", "a"), - ("C", "2016-01-01T00:00:00.000+00:00"), + ("C", "2016-01-01T00:00:00.000+0000"), ("D", "P0DT1H0M0S"), ("E", "a"), ("F", "a"), ("G", 1.0), - ("H", "2016-01-01T00:00:00.000-06:00"), + ("H", "2016-01-01T00:00:00.000-0600"), ] ), OrderedDict( @@ -298,12 +298,12 @@ def test_to_json(self): ("idx", 1), ("A", 2), ("B", "b"), - ("C", "2016-01-02T00:00:00.000+00:00"), + ("C", "2016-01-02T00:00:00.000+0000"), ("D", "P0DT1H1M0S"), ("E", "b"), ("F", "b"), ("G", 2.0), - ("H", "2016-01-02T00:00:00.000-06:00"), + ("H", "2016-01-02T00:00:00.000-0600"), ] ), OrderedDict( @@ -311,12 +311,12 @@ def test_to_json(self): ("idx", 2), ("A", 3), ("B", "c"), - ("C", "2016-01-03T00:00:00.000+00:00"), + ("C", "2016-01-03T00:00:00.000+0000"), ("D", "P0DT1H2M0S"), ("E", "c"), ("F", "c"), ("G", 3.0), - ("H", "2016-01-03T00:00:00.000-06:00"), + ("H", "2016-01-03T00:00:00.000-0600"), ] ), OrderedDict( @@ -324,12 +324,12 @@ def test_to_json(self): ("idx", 3), ("A", 4), ("B", "c"), - ("C", "2016-01-04T00:00:00.000+00:00"), + ("C", "2016-01-04T00:00:00.000+0000"), ("D", "P0DT1H3M0S"), ("E", "c"), ("F", "c"), ("G", 4.0), - ("H", "2016-01-04T00:00:00.000-06:00"), + ("H", "2016-01-04T00:00:00.000-0600"), ] ), ] @@ -381,8 
+381,8 @@ def test_to_json_period_index(self): schema = {"fields": fields, "primaryKey": ["index"]} data = [ - OrderedDict([("index", "2015-11-01T00:00:00.000+00:00"), ("values", 1)]), - OrderedDict([("index", "2016-02-01T00:00:00.000+00:00"), ("values", 1)]), + OrderedDict([("index", "2015-11-01T00:00:00.000+0000"), ("values", 1)]), + OrderedDict([("index", "2016-02-01T00:00:00.000+0000"), ("values", 1)]), ] expected = OrderedDict([("schema", schema), ("data", data)]) @@ -612,7 +612,7 @@ def test_timestamp_in_columns(self): ) result = df.to_json(orient="table") js = json.loads(result) - assert js["schema"]["fields"][1]["name"] == "2016-01-01T00:00:00.000+00:00" + assert js["schema"]["fields"][1]["name"] == "2016-01-01T00:00:00.000+0000" # TODO - below expectation is not correct; see GH 28256 assert js["schema"]["fields"][2]["name"] == 10000 diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 13621a9ce2480..0844f25f6edec 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1156,12 +1156,12 @@ def test_sparse(self): @pytest.mark.parametrize( "ts,expected", [ - (Timestamp("2013-01-10 05:00:00Z"), '"2013-01-10T05:00:00.000+00:00"'), + (Timestamp("2013-01-10 05:00:00Z"), '"2013-01-10T05:00:00.000+0000"'), ( Timestamp("2013-01-10 00:00:00", tz="US/Eastern"), - '"2013-01-10T00:00:00.000-05:00"', + '"2013-01-10T00:00:00.000-0500"', ), - (Timestamp("2013-01-10 00:00:00-0500"), '"2013-01-10T00:00:00.000-05:00"'), + (Timestamp("2013-01-10 00:00:00-0500"), '"2013-01-10T00:00:00.000-0500"'), ], ) def test_tz_utc_offsets(self, ts, expected): @@ -1175,11 +1175,11 @@ def test_tz_range_is_utc(self): from pandas.io.json import dumps tz_range = pd.date_range("2013-01-01 05:00:00Z", periods=2) - exp = '["2013-01-01T05:00:00.000+00:00","2013-01-02T05:00:00.000+00:00"]' + exp = '["2013-01-01T05:00:00.000+0000","2013-01-02T05:00:00.000+0000"]' dfexp = ( '{"DT":{' - '"0":"2013-01-01T05:00:00.000+00:00",' - 
'"1":"2013-01-02T05:00:00.000+00:00"}}' + '"0":"2013-01-01T05:00:00.000+0000",' + '"1":"2013-01-02T05:00:00.000+0000"}}' ) assert dumps(tz_range, iso_dates=True) == exp @@ -1195,7 +1195,7 @@ def test_datetime_tz_iso_maintains_offset(self, orient): df = DataFrame(tz_range, columns=["date"]) result = df.to_json(orient=orient, date_format="iso") - assert "2013-01-01T00:00:00.000-05:00" in result + assert "2013-01-01T00:00:00.000-0500" in result def test_read_inline_jsonl(self): # GH9180 From 7558d07580ab44cd6d0ed9debdcb81972190b73f Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 10 Oct 2019 21:41:45 -0700 Subject: [PATCH 17/17] Added benchmarks: --- asv_bench/benchmarks/io/json.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py index 5c1d39776b91c..4b7a8bfd103b3 100644 --- a/asv_bench/benchmarks/io/json.py +++ b/asv_bench/benchmarks/io/json.py @@ -67,10 +67,11 @@ class ToJSON(BaseIO): params = [ ["split", "columns", "index", "values", "records"], ["df", "df_date_idx", "df_td_int_ts", "df_int_floats", "df_int_float_str"], + ["epoch", "iso"], ] - param_names = ["orient", "frame"] + param_names = ["orient", "frame", "date_format"] - def setup(self, orient, frame): + def setup(self, orient, frame, date_format): N = 10 ** 5 ncols = 5 index = date_range("20000101", periods=N, freq="H") @@ -115,21 +116,21 @@ def setup(self, orient, frame): index=index, ) - def time_to_json(self, orient, frame): - getattr(self, frame).to_json(self.fname, orient=orient) + def time_to_json(self, orient, frame, date_format): + getattr(self, frame).to_json(self.fname, orient=orient, date_format=date_format) - def peakmem_to_json(self, orient, frame): - getattr(self, frame).to_json(self.fname, orient=orient) + def peakmem_to_json(self, orient, frame, date_format): + getattr(self, frame).to_json(self.fname, orient=orient, date_format=date_format) - def time_to_json_wide(self, orient, 
frame): + def time_to_json_wide(self, orient, frame, date_format): base_df = getattr(self, frame).copy() df = concat([base_df.iloc[:100]] * 1000, ignore_index=True, axis=1) - df.to_json(self.fname, orient=orient) + df.to_json(self.fname, orient=orient, date_format=date_format) - def peakmem_to_json_wide(self, orient, frame): + def peakmem_to_json_wide(self, orient, frame, date_format): base_df = getattr(self, frame).copy() df = concat([base_df.iloc[:100]] * 1000, ignore_index=True, axis=1) - df.to_json(self.fname, orient=orient) + df.to_json(self.fname, orient=orient, date_format=date_format) class ToJSONLines(BaseIO):