Skip to content

Commit 5a38dca

Browse files
committed
ENH: can pass list of dicts to DataFrame constructor, support Cython code, #526
1 parent 6f4ca40 commit 5a38dca

File tree

9 files changed

+116
-12
lines changed

9 files changed

+116
-12
lines changed

RELEASE.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ pandas 0.6.2
3535
**New features / modules**
3636

3737
- Handle differently-indexed output values in ``DataFrame.apply`` (GH #498)
38+
- Can pass list of dicts (e.g., a list of shallow JSON objects) to DataFrame
39+
constructor (GH #526)
3840

3941
**Improvements to existing features**
4042

doc/source/groupby.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ natural to group by one of the levels of the hierarchy.
161161
['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
162162
tuples = zip(*arrays)
163163
tuples
164-
index = MultiIndex.from_tuples(tuples, names=['first', 'second'])
164+
index = MultiIndex.from_tuples(tuples, names=['first', 'second'])
165165
s = Series(randn(8), index=index)
166166

167167
.. ipython:: python

pandas/core/frame.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -209,9 +209,16 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,
209209
mgr = self._init_ndarray(data, index, columns, dtype=dtype,
210210
copy=copy)
211211
elif isinstance(data, list):
212-
if len(data) > 0 and isinstance(data[0], (list, tuple)):
213-
data, columns = _list_to_sdict(data, columns)
214-
mgr = self._init_dict(data, index, columns, dtype=dtype)
212+
if len(data) > 0:
213+
if isinstance(data[0], (list, tuple)):
214+
data, columns = _list_to_sdict(data, columns)
215+
mgr = self._init_dict(data, index, columns, dtype=dtype)
216+
elif isinstance(data[0], dict):
217+
data, columns = _list_of_dict_to_sdict(data, columns)
218+
mgr = self._init_dict(data, index, columns, dtype=dtype)
219+
else:
220+
mgr = self._init_ndarray(data, index, columns, dtype=dtype,
221+
copy=copy)
215222
else:
216223
mgr = self._init_ndarray(data, index, columns, dtype=dtype,
217224
copy=copy)
@@ -3577,7 +3584,17 @@ def _list_to_sdict(data, columns):
35773584
if columns is None:
35783585
columns = []
35793586
return {}, columns
3587+
return _convert_object_array(content, columns)
3588+
3589+
def _list_of_dict_to_sdict(data, columns):
    """
    Convert a list of dicts (one dict per row) into column-oriented data.

    Parameters
    ----------
    data : list of dict
        Row records; a key missing from a record becomes NaN in that row.
    columns : sequence or None
        Keys to extract, in order. When None, the distinct keys found across
        all records are used, as computed by
        ``lib.fast_unique_multiple_list_gen``.

    Returns
    -------
    The result of ``_convert_object_array(content, columns)``; the caller
    unpacks it as a ``(data, columns)`` pair.
    """
    if columns is None:
        # Materialize every key collection as a real list: the Cython helper
        # types its per-record buffer as ``list`` and therefore rejects the
        # key views that ``dict.keys()`` returns on Python 3.
        gen = (list(x.keys()) for x in data)
        columns = lib.fast_unique_multiple_list_gen(gen)

    # dicts_to_array returns an (n_records, n_columns) object array;
    # transposing makes each element of ``content`` one column of values.
    content = list(lib.dicts_to_array(data, list(columns)).T)
    return _convert_object_array(content, columns)
35803596

3597+
def _convert_object_array(content, columns):
35813598
if columns is None:
35823599
columns = range(len(content))
35833600
else:

pandas/src/tseries.pyx

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -376,6 +376,57 @@ def fast_unique_multiple_list(list lists):
376376

377377
return uniques
378378

379+
@cython.wraparound(False)
@cython.boundscheck(False)
def fast_unique_multiple_list_gen(object gen):
    """
    Collect the distinct values found across every list produced by ``gen``.

    Parameters
    ----------
    gen : iterable yielding lists

    Returns
    -------
    list
        The distinct values, sorted when sorting succeeds and otherwise in
        first-seen order.
    """
    cdef:
        list chunk
        Py_ssize_t pos, size
        list found = []
        set seen = set()
        object item

    for chunk in gen:
        size = len(chunk)
        for pos in range(size):
            item = chunk[pos]
            if item in seen:
                continue
            seen.add(item)
            found.append(item)

    # Sorting is best-effort: if the values cannot be ordered, keep the
    # first-seen order rather than failing.
    try:
        found.sort()
    except Exception:
        pass

    return found
403+
404+
@cython.wraparound(False)
@cython.boundscheck(False)
def dicts_to_array(list dicts, list columns):
    """
    Lay out a list of dicts as a 2-D object array.

    Parameters
    ----------
    dicts : list of dict
        One dict per output row.
    columns : list
        Keys to look up in each dict, one per output column.

    Returns
    -------
    ndarray of object, shape (len(dicts), len(columns))
        Element ``[r, c]`` is ``dicts[r][columns[c]]`` when that key is
        present, otherwise ``np.nan``.
    """
    cdef:
        Py_ssize_t r, c, nrows, ncols
        ndarray[object, ndim=2] out
        dict record
        object key, missing = np.nan

    nrows = len(dicts)
    ncols = len(columns)

    out = np.empty((nrows, ncols), dtype='O')

    for r in range(nrows):
        record = dicts[r]
        for c in range(ncols):
            key = columns[c]
            # absent keys are filled with NaN instead of raising KeyError
            out[r, c] = record[key] if key in record else missing

    return out
428+
429+
379430
def fast_zip(list ndarrays):
380431
'''
381432
For zipping multiple ndarrays into an ndarray of tuples

pandas/tests/test_frame.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1243,6 +1243,23 @@ def test_constructor_list_of_lists(self):
12431243
self.assert_(com.is_integer_dtype(df['num']))
12441244
self.assert_(df['str'].dtype == np.object_)
12451245

1246+
def test_constructor_list_of_dicts(self):
    """DataFrame built from a list of dicts matches the from_dict result."""
    # records with differing key sets, including one with no keys at all
    data = [
        {'a': 1.5, 'b': 3, 'c': 4, 'd': 6},
        {'a': 1.5, 'b': 3, 'd': 6},
        {'a': 1.5, 'd': 6},
        {},
        {'a': 1.5, 'b': 3, 'c': 4},
        {'b': 3, 'c': 4, 'd': 6},
    ]

    result = DataFrame(data)
    # key each record by its position so from_dict can treat them as rows
    keyed_by_position = dict(enumerate(data))
    expected = DataFrame.from_dict(keyed_by_position, orient='index')
    assert_frame_equal(result, expected.reindex(result.index))

    # a lone empty dict is expected to compare equal to an empty frame
    result = DataFrame([{}])
    expected = DataFrame([])
    assert_frame_equal(result, expected)
1262+
12461263
def test_constructor_ragged(self):
12471264
data = {'A' : randn(10),
12481265
'B' : randn(8)}
@@ -3752,7 +3769,7 @@ def test_reset_index(self):
37523769
deleveled2['level_1']))
37533770

37543771
# exception if no name
3755-
self.assertRaises(Exception, self.frame.delevel)
3772+
self.assertRaises(Exception, self.frame.reset_index)
37563773

37573774
# but this is ok
37583775
self.frame.index.name = 'index'

pandas/tests/test_groupby.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -786,7 +786,7 @@ def test_cython_agg_nothing_to_agg(self):
786786
self.assertRaises(GroupByError, frame.groupby('a')['b'].mean)
787787

788788
def test_grouping_attrs(self):
789-
deleveled = self.mframe.delevel()
789+
deleveled = self.mframe.reset_index()
790790
grouped = deleveled.groupby(['first', 'second'])
791791

792792
for i, ping in enumerate(grouped.groupings):
@@ -795,7 +795,7 @@ def test_grouping_attrs(self):
795795

796796
def test_groupby_level(self):
797797
frame = self.mframe
798-
deleveled = frame.delevel()
798+
deleveled = frame.reset_index()
799799

800800
result0 = frame.groupby(level=0).sum()
801801
result1 = frame.groupby(level=1).sum()
@@ -840,7 +840,7 @@ def test_groupby_level_apply(self):
840840

841841
def test_groupby_level_mapper(self):
842842
frame = self.mframe
843-
deleveled = frame.delevel()
843+
deleveled = frame.reset_index()
844844

845845
mapper0 = {'foo' : 0, 'bar' : 0,
846846
'baz' : 1, 'qux' : 1}

pandas/tests/test_multilevel.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -249,7 +249,7 @@ def test_sortlevel(self):
249249
# series
250250
a_sorted = self.frame['A'].sortlevel(0)
251251
self.assertRaises(Exception,
252-
self.frame.delevel()['A'].sortlevel)
252+
self.frame.reset_index()['A'].sortlevel)
253253

254254
# preserve names
255255
self.assertEquals(a_sorted.index.names, self.frame.index.names)
@@ -261,7 +261,7 @@ def test_delevel_infer_dtype(self):
261261
names=['prm0', 'prm1', 'prm2'])
262262
df = DataFrame(np.random.randn(8,3), columns=['A', 'B', 'C'],
263263
index=index)
264-
deleveled = df.delevel()
264+
deleveled = df.reset_index()
265265
self.assert_(com.is_integer_dtype(deleveled['prm1']))
266266
self.assert_(com.is_float_dtype(deleveled['prm2']))
267267

vb_suite/pandas_vb_common.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,9 @@
22
import pandas.util.testing as tm
33
import random
44
import numpy as np
5+
6+
# didn't add to namespace until later
7+
try:
8+
from pandas.core.index import MultiIndex
9+
except ImportError:
10+
pass

vb_suite/reindex.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,13 +57,24 @@
5757
5858
ts = Series(np.random.randn(len(rng)), index=rng)
5959
ts2 = ts[::2]
60+
61+
def pad():
62+
try:
63+
ts2.reindex(ts.index, method='pad')
64+
except:
65+
ts2.reindex(ts.index, fillMethod='pad')
66+
def backfill():
67+
try:
68+
ts2.reindex(ts.index, method='backfill')
69+
except:
70+
ts2.reindex(ts.index, fillMethod='backfill')
6071
"""
6172

62-
statement = "ts2.reindex(ts.index, method='pad')"
73+
statement = "pad()"
6374
reindex_daterange_pad = Benchmark(statement, setup,
6475
name="reindex_daterange_pad")
6576

66-
statement = "ts2.reindex(ts.index, method='backfill')"
77+
statement = "backfill()"
6778
reindex_daterange_backfill = Benchmark(statement, setup,
6879
name="reindex_daterange_backfill")
6980

0 commit comments

Comments
 (0)