BUG: proper type inference with list of lists passed to DataFrame constructor, from_records type-handling fixes, GH #484

wesm · wesm · commit 21bad0f6c216 · 2011-12-13T14:35:10.000-05:00
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -58,6 +58,7 @@ pandas 0.6.1
     matrices (GH #189)
   - Add `margins` option to `pivot_table` for computing subgroup aggregates (GH
     #114)
+  - Add `Series.from_csv` function (PR #482)
 
 **Improvements to existing features**
 
@@ -129,6 +130,7 @@ Thanks
 - Chang She
 - Ted Square
 - Chris Uga
+- Dieter Vandenbussche
 
 pandas 0.6.0
 ============
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -207,8 +207,12 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,
                 mgr = self._init_ndarray(data, index, columns, dtype=dtype,
                                          copy=copy)
         elif isinstance(data, list):
-            mgr = self._init_ndarray(data, index, columns, dtype=dtype,
-                                     copy=copy)
+            if isinstance(data[0], (list, tuple)):
+                data, columns = _list_to_sdict(data, columns)
+                mgr = self._init_dict(data, index, columns, dtype=dtype)
+            else:
+                mgr = self._init_ndarray(data, index, columns, dtype=dtype,
+                                         copy=copy)
         else:
             raise PandasError('DataFrame constructor not properly called!')
 
@@ -528,20 +532,7 @@ def from_records(cls, data, index=None, exclude=None, columns=None,
         if isinstance(data, (np.ndarray, DataFrame, dict)):
             columns, sdict = _rec_to_dict(data)
         else:
-            if isinstance(data[0], tuple):
-                content = list(lib.to_object_array_tuples(data).T)
-            else:
-                # list of lists
-                content = list(lib.to_object_array(data).T)
-
-            if columns is None:
-                columns = range(len(content))
-            else:
-                assert(len(columns) == len(content))
-
-            sdict = dict((c, lib.maybe_convert_objects(vals))
-                         for c, vals in zip(columns, content))
-            del content
+            sdict, columns = _list_to_sdict(data, columns)
 
         if exclude is None:
             exclude = set()
@@ -3547,6 +3538,22 @@ def _rec_to_dict(arr):
 
     return columns, sdict
 
+def _list_to_sdict(data, columns):
+    if isinstance(data[0], tuple):
+        content = list(lib.to_object_array_tuples(data).T)
+    else:
+        # list of lists
+        content = list(lib.to_object_array(data).T)
+
+    if columns is None:
+        columns = range(len(content))
+    else:
+        assert(len(columns) == len(content))
+
+    sdict = dict((c, lib.maybe_convert_objects(vals))
+                 for c, vals in zip(columns, content))
+    return sdict, columns
+
 def _homogenize(data, index, columns, dtype=None):
     from pandas.core.series import _sanitize_array
 
diff --git a/pandas/src/parsing.pyx b/pandas/src/parsing.pyx
@@ -85,7 +85,7 @@ def maybe_convert_numeric(ndarray[object] values, set na_values):
     for i from 0 <= i < n:
         val = values[i]
 
-        if cpython.PyFloat_Check(val):
+        if util.is_float_object(val):
             floats[i] = val
             seen_float = 1
         elif val in na_values:
@@ -144,18 +144,18 @@ def maybe_convert_objects(ndarray[object] objects):
             seen_null = 1
             objects[i] = onan
             floats[i] = fnan
-        elif cpython.PyBool_Check(val):
+        elif util.is_bool_object(val):
             seen_bool = 1
             bools[i] = val
-        elif is_integer_object(val):
+        elif util.is_integer_object(val):
             seen_int = 1
             floats[i] = <float64_t> val
             if not seen_null:
                 ints[i] = val
-        elif cpython.PyFloat_Check(val):
+        elif util.is_float_object(val):
             floats[i] = val
             seen_float = 1
-        elif not (cpython.PyString_Check(val) or cpython.PyUnicode_Check(val)):
+        elif not util.is_string_object(val):
             # this will convert Decimal objects
             try:
                 floats[i] = float(val)
@@ -173,14 +173,16 @@ def maybe_convert_objects(ndarray[object] objects):
     else:
         if seen_object:
             return objects
-        elif seen_int:
-            return ints
-        elif seen_float:
-            return floats
-        elif seen_bool:
-            return bools.view(np.bool_)
+        elif not seen_bool:
+            if seen_float:
+                return floats
+            elif seen_int:
+                return ints
         else:
-            return objects
+            if not seen_float and not seen_int:
+                return bools.view(np.bool_)
+
+        return objects
 
 convert_sql_column = maybe_convert_objects
 
diff --git a/pandas/src/tseries.pyx b/pandas/src/tseries.pyx
@@ -49,7 +49,7 @@ cdef double_t *get_double_ptr(ndarray arr):
 
     return <double_t *> arr.data
 
-from util cimport is_integer_object
+cimport util
 
 cdef extern from "math.h":
     double sqrt(double x)
diff --git a/pandas/src/util.pxd b/pandas/src/util.pxd
@@ -4,6 +4,8 @@ cimport numpy as cnp
 cdef extern from "numpy_helper.h":
     inline int is_integer_object(object)
     inline int is_float_object(object)
+    inline int is_bool_object(object)
+    inline int is_string_object(object)
     inline int assign_value_1d (ndarray, Py_ssize_t, object) except -1
 
 cpdef inline object get_value_at(ndarray arr, object loc):
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -1222,6 +1222,13 @@ def test_constructor_more(self):
         self.assertEqual(len(dm.columns), 2)
         self.assert_(dm.values.dtype == np.float64)
 
+    def test_constructor_list_of_lists(self):
+        # GH #484
+        l = [[1, 'a'], [2, 'b']]
+        df = DataFrame(data=l, columns=["num", "str"])
+        self.assert_(com.is_integer_dtype(df['num']))
+        self.assert_(df['str'].dtype == np.object_)
+
     def test_constructor_ragged(self):
         data = {'A' : randn(10),
                 'B' : randn(8)}
diff --git a/setup.py b/setup.py
@@ -286,7 +286,7 @@ def srcpath(name=None, suffix='.pyx', subdir='src'):
     tseries_depends = [srcpath(f, suffix='.pyx')
                        for f in tseries_depends]
 else:
-    tseries_depends = None
+    tseries_depends = []
 
 tseries_ext = Extension('pandas._tseries',
                         depends=tseries_depends + ['pandas/src/numpy_helper.h'],