ENH: can pass list of dicts to DataFrame constructor, support Cython code, #526

wesm · wesm · commit 5a38dcaad29e · 2011-12-22T19:36:41.000-05:00
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -35,6 +35,8 @@ pandas 0.6.2
 **New features / modules**
 
   - Handle differently-indexed output values in ``DataFrame.apply`` (GH #498)
+  - Can pass list of dicts (e.g., a list of shallow JSON objects) to DataFrame
+    constructor (GH #526)
 
 **Improvements to existing features**
 
diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst
@@ -161,7 +161,7 @@ natural to group by one of the levels of the hierarchy.
              ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
    tuples = zip(*arrays)
    tuples
-   index = MultiIndex.from_tuples(tuples, names=['first', 'second'])
+   index = MultiIndex.from_tuples(tuples, names=['first', 'second'])
    s = Series(randn(8), index=index)
 
 .. ipython:: python
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -209,9 +209,16 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,
                 mgr = self._init_ndarray(data, index, columns, dtype=dtype,
                                          copy=copy)
         elif isinstance(data, list):
-            if len(data) > 0 and isinstance(data[0], (list, tuple)):
-                data, columns = _list_to_sdict(data, columns)
-                mgr = self._init_dict(data, index, columns, dtype=dtype)
+            if len(data) > 0:
+                if isinstance(data[0], (list, tuple)):
+                    data, columns = _list_to_sdict(data, columns)
+                    mgr = self._init_dict(data, index, columns, dtype=dtype)
+                elif isinstance(data[0], dict):
+                    data, columns = _list_of_dict_to_sdict(data, columns)
+                    mgr = self._init_dict(data, index, columns, dtype=dtype)
+                else:
+                    mgr = self._init_ndarray(data, index, columns, dtype=dtype,
+                                             copy=copy)
             else:
                 mgr = self._init_ndarray(data, index, columns, dtype=dtype,
                                          copy=copy)
@@ -3577,7 +3584,17 @@ def _list_to_sdict(data, columns):
         if columns is None:
             columns = []
         return {}, columns
+    return _convert_object_array(content, columns)
+
+def _list_of_dict_to_sdict(data, columns):  # list of row dicts -> per-column content
+    if columns is None:  # no explicit columns: derive them from the dicts' keys
+        gen = (x.keys() for x in data)  # lazily yields the keys of each dict
+        columns = lib.fast_unique_multiple_list_gen(gen)  # unique keys, sorted when sortable
+
+    content = list(lib.dicts_to_array(data, list(columns)).T)  # transpose: one array per column
+    return _convert_object_array(content, columns)
 
+def _convert_object_array(content, columns):
     if columns is None:
         columns = range(len(content))
     else:
diff --git a/pandas/src/tseries.pyx b/pandas/src/tseries.pyx
@@ -376,6 +376,57 @@ def fast_unique_multiple_list(list lists):
 
     return uniques
 
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def fast_unique_multiple_list_gen(object gen):  # unique values over all lists yielded by gen
+    cdef:
+        list buf  # every item produced by gen is bound to this list-typed variable
+        Py_ssize_t j, n
+        list uniques = []  # values in first-seen order until the sort attempt below
+        dict table = {}  # values already seen; only the keys matter
+        object val, stub = 0  # stub: placeholder stored as the value in table
+
+    for buf in gen:
+        n = len(buf)
+        for j from 0 <= j < n:
+            val = buf[j]
+            if val not in table:  # first occurrence of val
+                table[val] = stub
+                uniques.append(val)
+
+    try:
+        uniques.sort()
+    except Exception:  # best effort: a failed sort is ignored
+        pass
+
+    return uniques
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def dicts_to_array(list dicts, list columns):  # object array: one row per dict, one column per key
+    cdef:
+        Py_ssize_t i, j, k, n
+        ndarray[object, ndim=2] result
+        dict row
+        object col, onan = np.nan  # onan: fill value for keys a row lacks
+
+    k = len(columns)  # number of output columns
+    n = len(dicts)  # number of output rows
+
+    result = np.empty((n, k), dtype='O')
+
+    for i in range(n):
+        row = dicts[i]
+        for j in range(k):
+            col = columns[j]
+            if col in row:
+                result[i, j] = row[col]
+            else:  # key absent from this dict
+                result[i, j] = onan
+
+    return result
+
+
 def fast_zip(list ndarrays):
     '''
     For zipping multiple ndarrays into an ndarray of tuples
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -1243,6 +1243,23 @@ def test_constructor_list_of_lists(self):
         self.assert_(com.is_integer_dtype(df['num']))
         self.assert_(df['str'].dtype == np.object_)
 
+    def test_constructor_list_of_dicts(self):  # DataFrame(list of dicts) vs. from_dict(orient='index')
+        data = [{'a': 1.5, 'b': 3, 'c':4, 'd':6},
+                {'a': 1.5, 'b': 3, 'd':6},
+                {'a': 1.5, 'd':6},
+                {},  # a row with no keys at all
+                {'a': 1.5, 'b': 3, 'c':4},
+                {'b': 3, 'c':4, 'd':6}]
+
+        result = DataFrame(data)
+        expected = DataFrame.from_dict(dict(zip(range(len(data)), data)),  # keyed by row position
+                                       orient='index')
+        assert_frame_equal(result, expected.reindex(result.index))
+
+        result = DataFrame([{}])  # a single empty dict is compared against DataFrame([])
+        expected = DataFrame([])
+        assert_frame_equal(result, expected)
+
     def test_constructor_ragged(self):
         data = {'A' : randn(10),
                 'B' : randn(8)}
@@ -3752,7 +3769,7 @@ def test_reset_index(self):
                                     deleveled2['level_1']))
 
         # exception if no name
-        self.assertRaises(Exception, self.frame.delevel)
+        self.assertRaises(Exception, self.frame.reset_index)
 
         # but this is ok
         self.frame.index.name = 'index'
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -786,7 +786,7 @@ def test_cython_agg_nothing_to_agg(self):
         self.assertRaises(GroupByError, frame.groupby('a')['b'].mean)
 
     def test_grouping_attrs(self):
-        deleveled = self.mframe.delevel()
+        deleveled = self.mframe.reset_index()
         grouped = deleveled.groupby(['first', 'second'])
 
         for i, ping in enumerate(grouped.groupings):
@@ -795,7 +795,7 @@ def test_grouping_attrs(self):
 
     def test_groupby_level(self):
         frame = self.mframe
-        deleveled = frame.delevel()
+        deleveled = frame.reset_index()
 
         result0 = frame.groupby(level=0).sum()
         result1 = frame.groupby(level=1).sum()
@@ -840,7 +840,7 @@ def test_groupby_level_apply(self):
 
     def test_groupby_level_mapper(self):
         frame = self.mframe
-        deleveled = frame.delevel()
+        deleveled = frame.reset_index()
 
         mapper0 = {'foo' : 0, 'bar' : 0,
                    'baz' : 1, 'qux' : 1}
diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py
@@ -249,7 +249,7 @@ def test_sortlevel(self):
         # series
         a_sorted = self.frame['A'].sortlevel(0)
         self.assertRaises(Exception,
-                          self.frame.delevel()['A'].sortlevel)
+                          self.frame.reset_index()['A'].sortlevel)
 
         # preserve names
         self.assertEquals(a_sorted.index.names, self.frame.index.names)
@@ -261,7 +261,7 @@ def test_delevel_infer_dtype(self):
                                        names=['prm0', 'prm1', 'prm2'])
         df = DataFrame(np.random.randn(8,3), columns=['A', 'B', 'C'],
                        index=index)
-        deleveled = df.delevel()
+        deleveled = df.reset_index()
         self.assert_(com.is_integer_dtype(deleveled['prm1']))
         self.assert_(com.is_float_dtype(deleveled['prm2']))
 
diff --git a/vb_suite/pandas_vb_common.py b/vb_suite/pandas_vb_common.py
@@ -2,3 +2,9 @@
 import pandas.util.testing as tm
 import random
 import numpy as np
+
+# MultiIndex wasn't added to the namespace until later; tolerate its absence
+try:
+    from pandas.core.index import MultiIndex
+except ImportError:
+    pass
diff --git a/vb_suite/reindex.py b/vb_suite/reindex.py
@@ -57,13 +57,24 @@
 
 ts = Series(np.random.randn(len(rng)), index=rng)
 ts2 = ts[::2]
+
+def pad():
+    try:
+        ts2.reindex(ts.index, method='pad')
+    except:
+        ts2.reindex(ts.index, fillMethod='pad')
+def backfill():
+    try:
+        ts2.reindex(ts.index, method='backfill')
+    except:
+        ts2.reindex(ts.index, fillMethod='backfill')
 """
 
-statement = "ts2.reindex(ts.index, method='pad')"
+statement = "pad()"  # pad() comes from the setup code: tries method='pad', falls back to fillMethod='pad'
 reindex_daterange_pad = Benchmark(statement, setup,
                                   name="reindex_daterange_pad")
 
-statement = "ts2.reindex(ts.index, method='backfill')"
+statement = "backfill()"  # backfill() comes from the setup code: tries method='backfill', falls back to fillMethod='backfill'
 reindex_daterange_backfill = Benchmark(statement, setup,
                                        name="reindex_daterange_backfill")