BUG: DataFrame can now handle lists of tuples just like Series; includes a bit of refactoring for code reuse. GH #293

wesm · wesm · commit e63cbd7e52a4 · 2011-11-04T16:02:39.000-04:00
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -3199,6 +3199,8 @@ def _rec_to_dict(arr):
     return columns, sdict
 
 def _homogenize(data, index, columns, dtype=None):
+    from pandas.core.series import _sanitize_array
+
     homogenized = {}
 
     if dtype is not None:
@@ -3225,23 +3227,9 @@ def _homogenize(data, index, columns, dtype=None):
         else:
             if isinstance(v, dict):
                 v = [v.get(i, nan) for i in index]
-            elif np.isscalar(v):
-                _v = np.empty(len(index), dtype=_infer_dtype(v))
-                _v.fill(v)
-                v = _v
-            else:
-                assert(len(v) == len(index))
 
-            # only *attempt* to cast to dtype
-            try:
-                arr = np.asarray(v, dtype=dtype)
-
-                # prevent NumPy from casting things to string when it shouldn't
-                if issubclass(arr.dtype.type, basestring):
-                    arr = np.array(v, dtype=object, copy=False)
-                v = arr
-            except Exception:
-                v = np.asarray(v)
+            v = _sanitize_array(v, index, dtype=dtype, copy=False,
+                                raise_cast_failure=False)
 
         homogenized[k] = v
 
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -104,49 +104,15 @@ def __new__(cls, data, index=None, dtype=None, name=None, copy=False):
                 index = Index(sorted(data.keys()))
             data = [data.get(idx, np.nan) for idx in index]
 
-        try:
-            subarr = np.array(data, dtype=dtype, copy=copy)
-        except ValueError:
-            if dtype:
-                raise
-            else:  # pragma: no cover
-                subarr = np.array(data, dtype=object)
-
-        if subarr.ndim == 0:
-            if isinstance(data, list):  # pragma: no cover
-                subarr = np.array(data, dtype=object)
-            elif index is not None:
-                value = data
-
-                # If we create an empty array using a string to infer
-                # the dtype, NumPy will only allocate one character per entry
-                # so this is kind of bad. Alternately we could use np.repeat
-                # instead of np.empty (but then you still don't want things
-                # coming out as np.str_!
-                if isinstance(value, basestring) and dtype is None:
-                    dtype = np.object_
-
-                if dtype is None:
-                    subarr = np.empty(len(index), dtype=type(value))
-                else:
-                    subarr = np.empty(len(index), dtype=dtype)
-                subarr.fill(value)
-            else:
-                return subarr.item()
-        elif subarr.ndim > 1:
-            if isinstance(data, np.ndarray):
-                raise Exception('Data must be 1-dimensional')
-            else:
-                subarr = _asarray_tuplesafe(data, dtype=dtype)
+        subarr = _sanitize_array(data, index, dtype, copy,
+                                 raise_cast_failure=True)
+
+        if not isinstance(subarr, np.ndarray):
+            return subarr
 
         if index is None:
             index = _default_index(len(subarr))
 
-        # This is to prevent mixed-type Series getting all casted to
-        # NumPy string type, e.g. NaN --> '-1#IND'.
-        if issubclass(subarr.dtype.type, basestring):
-            subarr = np.array(data, dtype=object, copy=copy)
-
         # Change the class of the array to be the subclass type.
         subarr = subarr.view(cls)
         subarr.index = index
@@ -2001,6 +1967,50 @@ def remove_na(arr):
     return arr[notnull(arr)]
 
 
+def _sanitize_array(data, index, dtype=None, copy=False,
+                    raise_cast_failure=False):
+    try:
+        subarr = np.array(data, dtype=dtype, copy=copy)
+    except (ValueError, TypeError):
+        if dtype and raise_cast_failure:
+            raise
+        else:  # pragma: no cover
+            subarr = np.array(data, dtype=object)
+
+    if subarr.ndim == 0:
+        if isinstance(data, list):  # pragma: no cover
+            subarr = np.array(data, dtype=object)
+        elif index is not None:
+            value = data
+
+            # If we create an empty array using a string to infer
+            # the dtype, NumPy will only allocate one character per entry
+            # so this is kind of bad. Alternatively we could use np.repeat
+            # instead of np.empty (but then you still don't want things
+            # coming out as np.str_!)
+            if isinstance(value, basestring) and dtype is None:
+                dtype = np.object_
+
+            if dtype is None:
+                subarr = np.empty(len(index), dtype=type(value))
+            else:
+                subarr = np.empty(len(index), dtype=dtype)
+            subarr.fill(value)
+        else:
+            return subarr.item()
+    elif subarr.ndim > 1:
+        if isinstance(data, np.ndarray):
+            raise Exception('Data must be 1-dimensional')
+        else:
+            subarr = _asarray_tuplesafe(data, dtype=dtype)
+
+    # This is to prevent mixed-type Series from being cast entirely to
+    # NumPy string type, e.g. NaN --> '-1#IND'.
+    if issubclass(subarr.dtype.type, basestring):
+        subarr = np.array(data, dtype=object, copy=copy)
+
+    return subarr
+
 def _get_rename_function(mapper):
     if isinstance(mapper, (dict, Series)):
         def f(x):
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -1119,6 +1119,11 @@ def test_constructor_mixed_dict_and_Series(self):
         result = DataFrame(data)
         self.assert_(result.index.is_monotonic)
 
+    def test_constructor_tuples(self):
+        result = DataFrame({'A': [(1, 2), (3, 4)]})
+        expected = DataFrame({'A': Series([(1, 2), (3, 4)])})
+        assert_frame_equal(result, expected)
+
     def test_astype(self):
         casted = self.frame.astype(int)
         expected = DataFrame(self.frame.values.astype(int),