Skip to content

Commit 9b453e0

Browse files
committed
Merge pull request #9130 from Komnomnomnom/json-block-support
PERF: json support for blocks GH9037
2 parents 2114741 + a67bef4 commit 9b453e0

File tree

7 files changed

+884
-195
lines changed

7 files changed

+884
-195
lines changed

doc/source/whatsnew/v0.16.0.txt

+17-1
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,22 @@ Backwards incompatible API changes
2828
.. _whatsnew_0160.api_breaking:
2929

3030
- ``Index.duplicated`` now returns ``np.array(dtype=bool)`` rather than ``Index(dtype=object)`` containing ``bool`` values. (:issue:`8875`)
31+
- ``DataFrame.to_json`` now serialises each column of a mixed dtype frame using that column's own dtype (:issue:`9037`)
32+
33+
Previously data was coerced to a common dtype before serialisation, which for
34+
example resulted in integers being serialised to floats:
35+
36+
.. code-block:: python
37+
38+
In [2]: pd.DataFrame({'i': [1,2], 'f': [3.0, 4.2]}).to_json()
39+
Out[2]: '{"f":{"0":3.0,"1":4.2},"i":{"0":1.0,"1":2.0}}'
40+
41+
Now each column is serialised using its correct dtype:
42+
43+
.. code-block:: python
44+
45+
In [2]: pd.DataFrame({'i': [1,2], 'f': [3.0, 4.2]}).to_json()
46+
Out[2]: '{"f":{"0":3.0,"1":4.2},"i":{"0":1,"1":2}}'
3147

3248
Deprecations
3349
~~~~~~~~~~~~
@@ -46,10 +62,10 @@ Performance
4662
.. _whatsnew_0160.performance:
4763

4864
- Fixed a performance regression for ``.loc`` indexing with an array or list-like (:issue:`9126`).
65+
- 30x performance improvement in ``DataFrame.to_json`` for mixed dtype frames (:issue:`9037`)
4966
- Performance improvements in ``MultiIndex.duplicated`` by working with labels instead of values (:issue:`9125`)
5067
- Improved the speed of ``nunique`` by calling ``unique`` instead of ``value_counts`` (:issue:`9129`, :issue:`7771`)
5168

52-
5369
Bug Fixes
5470
~~~~~~~~~
5571

pandas/core/generic.py

+1-9
Original file line numberDiff line numberDiff line change
@@ -2160,20 +2160,12 @@ def as_blocks(self):
21602160
Convert the frame to a dict of dtype -> Constructor Types that each has
21612161
a homogeneous dtype.
21622162
2163-
are presented in sorted order unless a specific list of columns is
2164-
provided.
2165-
21662163
NOTE: the dtypes of the blocks WILL BE PRESERVED HERE (unlike in
21672164
as_matrix)
21682165
2169-
Parameters
2170-
----------
2171-
columns : array-like
2172-
Specific column order
2173-
21742166
Returns
21752167
-------
2176-
values : a list of Object
2168+
values : a dict of dtype -> Constructor Types
21772169
"""
21782170
self._consolidate_inplace()
21792171

pandas/io/tests/test_json/test_pandas.py

+32-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# pylint: disable-msg=W0612,E1101
2-
from pandas.compat import range, lrange, StringIO
2+
from pandas.compat import range, lrange, StringIO, OrderedDict
33
from pandas import compat
44
import os
55

@@ -337,14 +337,44 @@ def test_v12_compat(self):
337337

338338
v12_json = os.path.join(self.dirpath, 'tsframe_v012.json')
339339
df_unser = pd.read_json(v12_json)
340-
df_unser = pd.read_json(v12_json)
341340
assert_frame_equal(df, df_unser)
342341

343342
df_iso = df.drop(['modified'], axis=1)
344343
v12_iso_json = os.path.join(self.dirpath, 'tsframe_iso_v012.json')
345344
df_unser_iso = pd.read_json(v12_iso_json)
346345
assert_frame_equal(df_iso, df_unser_iso)
347346

347+
def test_blocks_compat_GH9037(self):
348+
index = pd.date_range('20000101', periods=10, freq='H')
349+
df_mixed = DataFrame(OrderedDict(
350+
float_1=[-0.92077639, 0.77434435, 1.25234727, 0.61485564,
351+
-0.60316077, 0.24653374, 0.28668979, -2.51969012,
352+
0.95748401, -1.02970536],
353+
int_1=[19680418, 75337055, 99973684, 65103179, 79373900,
354+
40314334, 21290235, 4991321, 41903419, 16008365],
355+
str_1=['78c608f1', '64a99743', '13d2ff52', 'ca7f4af2', '97236474',
356+
'bde7e214', '1a6bde47', 'b1190be5', '7a669144', '8d64d068'],
357+
float_2=[-0.0428278, -1.80872357, 3.36042349, -0.7573685,
358+
-0.48217572, 0.86229683, 1.08935819, 0.93898739,
359+
-0.03030452, 1.43366348],
360+
str_2=['14f04af9', 'd085da90', '4bcfac83', '81504caf', '2ffef4a9',
361+
'08e2f5c4', '07e1af03', 'addbd4a7', '1f6a09ba', '4bfc4d87'],
362+
int_2=[86967717, 98098830, 51927505, 20372254, 12601730, 20884027,
363+
34193846, 10561746, 24867120, 76131025]
364+
), index=index)
365+
366+
# JSON deserialisation always creates unicode strings
367+
df_mixed.columns = df_mixed.columns.astype('unicode')
368+
369+
df_roundtrip = pd.read_json(df_mixed.to_json(orient='split'),
370+
orient='split')
371+
assert_frame_equal(df_mixed, df_roundtrip,
372+
check_index_type=True,
373+
check_column_type=True,
374+
check_frame_type=True,
375+
by_blocks=True,
376+
check_exact=True)
377+
348378
def test_series_non_unique_index(self):
349379
s = Series(['a', 'b'], index=[1, 1])
350380

pandas/src/datetime_helper.h

+4-1
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,11 @@ void mangle_nat(PyObject *val) {
1313
}
1414

1515
npy_int64 get_long_attr(PyObject *o, const char *attr) {
16+
npy_int64 long_val;
1617
PyObject *value = PyObject_GetAttrString(o, attr);
17-
return PyLong_Check(value) ? PyLong_AsLongLong(value) : PyInt_AS_LONG(value);
18+
long_val = (PyLong_Check(value) ? PyLong_AsLongLong(value) : PyInt_AS_LONG(value));
19+
Py_DECREF(value);
20+
return long_val;
1821
}
1922

2023
npy_float64 total_seconds(PyObject *td) {

0 commit comments

Comments
 (0)