CLN: change call signature of _maybe_promote (from stephenwlin branch)

jreback · jreback · commit cb56c98de37c · 2013-02-14T14:36:23.000-05:00
and _infer_dtype_from_scalar to match (both now return a (dtype, value) tuple)

Diff between 'jreback/dtypes_bug' and 'stephenwlin/dtypes_bug'

Conflicts:

	pandas/core/common.py
diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -504,7 +504,7 @@ def take_1d(arr, indexer, out=None, fill_value=np.nan):
         dtype, fill_value = arr.dtype, arr.dtype.type()
     else:
         indexer = _ensure_int64(indexer)
-        dtype = _maybe_promote(arr.dtype, fill_value)
+        dtype = _maybe_promote(arr.dtype, fill_value)[0]
         if dtype != arr.dtype:
             mask = indexer == -1
             needs_masking = mask.any()
@@ -552,7 +552,7 @@ def take_2d_multi(arr, row_idx, col_idx, fill_value=np.nan, out=None):
     else:
         col_idx = _ensure_int64(col_idx)
 
-    dtype = _maybe_promote(arr.dtype, fill_value)
+    dtype = _maybe_promote(arr.dtype, fill_value)[0]
     if dtype != arr.dtype:
         row_mask = row_idx == -1
         col_mask = col_idx == -1
@@ -588,7 +588,7 @@ def diff(arr, n, axis=0):
     n = int(n)
     dtype = arr.dtype
     if issubclass(dtype.type, np.integer):
-        dtype = np.float_
+        dtype = np.float64
     elif issubclass(dtype.type, np.bool_):
         dtype = np.object_
 
@@ -629,7 +629,7 @@ def take_fast(arr, indexer, mask, needs_masking, axis=0, out=None,
     else:
         indexer = _ensure_int64(indexer)
         if needs_masking:
-            dtype = _maybe_promote(arr.dtype, fill_value)
+            dtype = _maybe_promote(arr.dtype, fill_value)[0]
             if dtype != arr.dtype and out is not None and out.dtype != dtype:
                 raise Exception('Incompatible type for fill_value')
         else:
@@ -644,16 +644,20 @@ def take_fast(arr, indexer, mask, needs_masking, axis=0, out=None,
     take_f(arr, indexer, out=out, fill_value=fill_value)
     return out
 
+
 def _infer_dtype_from_scalar(val):
     """ interpret the dtype from a scalar, upcast floats and ints
         return the new value and the dtype """
 
+    dtype = np.object_
+
     # a 1-element ndarray
     if isinstance(val, pa.Array):
         if val.ndim != 0:
             raise ValueError("invalid ndarray passed to _infer_dtype_from_scalar")
 
-        return val.item(), val.dtype
+        dtype = val.dtype
+        val   = val.item()
 
     elif isinstance(val, basestring):
 
@@ -662,67 +666,79 @@ def _infer_dtype_from_scalar(val):
         # so this is kind of bad. Alternately we could use np.repeat
         # instead of np.empty (but then you still don't want things
         # coming out as np.str_!
-        return val, np.object_
+
+        dtype = np.object_
 
     elif isinstance(val, np.datetime64):
         # ugly hacklet
-        val = lib.Timestamp(val).value
-        return val, np.dtype('M8[ns]')
+        val   = lib.Timestamp(val).value
+        dtype = np.dtype('M8[ns]')
 
     elif is_bool(val):
-        return val, np.bool_
+        dtype = np.bool_
 
     # provide implicity upcast on scalars
     elif is_integer(val):
-            return val, np.int64
+        dtype = np.int64
+
     elif is_float(val):
-        return val, np.float64
+        dtype = np.float64
 
     elif is_complex(val):
-        return val, np.complex_
+        dtype = np.complex_
 
-    return val, np.object_
+    return dtype, val
 
 def _maybe_promote(dtype, fill_value=np.nan):
+    # returns tuple of (dtype, fill_value)
     if issubclass(dtype.type, np.datetime64):
-        # for now: refuse to upcast
+        # for now: refuse to upcast datetime64
         # (this is because datetime64 will not implicitly upconvert
         #  to object correctly as of numpy 1.6.1)
-        return dtype
+        if isnull(fill_value):
+            fill_value = tslib.iNaT
+        else:
+            try:
+                fill_value = lib.Timestamp(fill_value).value
+            except:
+                # the proper thing to do here would probably be to upcast to
+                # object (but numpy 1.6.1 doesn't do this properly)
+                fill_value = tslib.iNaT 
     elif is_float(fill_value):
         if issubclass(dtype.type, np.bool_):
-            return np.object_
+            dtype = np.object_
         elif issubclass(dtype.type, np.integer):
-            return np.float64
-        return dtype
+            dtype = np.float64
     elif is_bool(fill_value):
-        if issubclass(dtype.type, np.bool_):
-            return dtype
-        return np.object_
+        if not issubclass(dtype.type, np.bool_):
+            dtype = np.object_
     elif is_integer(fill_value):
         if issubclass(dtype.type, np.bool_):
-            return np.object_
+            dtype = np.object_
         elif issubclass(dtype.type, np.integer):
             # upcast to prevent overflow
             arr = np.asarray(fill_value)
             if arr != arr.astype(dtype):
-                return arr.dtype
-            return dtype
-        return dtype
+                dtype = arr.dtype
     elif is_complex(fill_value):
         if issubclass(dtype.type, np.bool_):
-            return np.object_
+            dtype = np.object_
         elif issubclass(dtype.type, (np.integer, np.floating)):
-            return np.complex_
-        return dtype
-    return np.object_
+            dtype = np.complex128
+    else:
+        dtype = np.object_
+    return dtype, fill_value
 
-def _maybe_upcast(values):
-    """ provide explicty type promotion and coercion """
-    new_dtype = _maybe_promote(values.dtype)
+def _maybe_upcast(values, fill_value=np.nan, copy=False):
+    """ provide explicit type promotion and coercion
+        if copy == True, then a copy is created even if no upcast is required """
+
+    new_dtype, fill_value = _maybe_promote(values.dtype, fill_value)
     if new_dtype != values.dtype:
         values = values.astype(new_dtype)
-    return values
+    elif copy:
+        values = values.copy()
+    return values, fill_value
 
 def _possibly_cast_item(obj, item, dtype):
     chunk = obj[item]
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -390,12 +390,8 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,
             mgr = self._init_dict(data, index, columns, dtype=dtype)
         elif isinstance(data, ma.MaskedArray):
             mask = ma.getmaskarray(data)
-            datacopy = ma.copy(data)
-            if issubclass(data.dtype.type, np.datetime64):
-                datacopy[mask] = tslib.iNaT
-            else:
-                datacopy = com._maybe_upcast(datacopy)
-                datacopy[mask] = NA
+            datacopy, fill_value = com._maybe_upcast(data, copy=True)
+            datacopy[mask] = fill_value
             mgr = self._init_ndarray(datacopy, index, columns, dtype=dtype,
                                      copy=copy)
         elif isinstance(data, np.ndarray):
@@ -437,7 +433,7 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,
                 if isinstance(data, basestring) and dtype is None:
                     dtype = np.object_
                 if dtype is None:
-                    data, dtype = _infer_dtype_from_scalar(data)
+                    dtype, data = _infer_dtype_from_scalar(data)
 
                 values = np.empty((len(index), len(columns)), dtype=dtype)
                 values.fill(data)
@@ -1878,7 +1874,7 @@ def set_value(self, index, col, value):
             new_index, new_columns = self._expand_axes((index, col))
             result = self.reindex(index=new_index, columns=new_columns,
                                   copy=False)
-            value, likely_dtype = _infer_dtype_from_scalar(value)
+            likely_dtype, value = _infer_dtype_from_scalar(value)
 
             made_bigger = not np.array_equal(new_columns, self.columns)
 
@@ -2208,7 +2204,7 @@ def _sanitize_column(self, key, value):
                 existing_piece = self[key]
 
                 # upcast the scalar
-                value, dtype = _infer_dtype_from_scalar(value)
+                dtype, value = _infer_dtype_from_scalar(value)
 
                 # transpose hack
                 if isinstance(existing_piece, DataFrame):
@@ -2217,16 +2213,11 @@ def _sanitize_column(self, key, value):
                 else:
                     value = np.repeat(value, len(self.index))
 
-                    # special case for now (promotion)
-                    if (com.is_float_dtype(existing_piece) and
-                            com.is_integer_dtype(value)):
-                        dtype = np.float64
-                        
                 value = value.astype(dtype)
 
             else:
                 # upcast the scalar
-                value, dtype = _infer_dtype_from_scalar(value)
+                dtype, value = _infer_dtype_from_scalar(value)
                 value = np.array(np.repeat(value, len(self.index)), dtype=dtype)
 
             value = com._possibly_cast_to_datetime(value, dtype)
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -377,11 +377,11 @@ def shift(self, indexer, periods):
         new_values = self.values.take(indexer, axis=1)
         # convert integer to float if necessary. need to do a lot more than
         # that, handle boolean etc also
-        new_values = com._maybe_upcast(new_values)
+        new_values, fill_value = com._maybe_upcast(new_values)
         if periods > 0:
-            new_values[:, :periods] = np.nan
+            new_values[:, :periods] = fill_value
         else:
-            new_values[:, periods:] = np.nan
+            new_values[:, periods:] = fill_value
         return make_block(new_values, self.items, self.ref_items)
 
     def where(self, func, other, cond = None, raise_on_error = True, try_cast = False):
@@ -1412,7 +1412,7 @@ def _make_na_block(self, items, ref_items, fill_value=np.nan):
         block_shape = list(self.shape)
         block_shape[0] = len(items)
 
-        fill_value, dtype = com._infer_dtype_from_scalar(fill_value)
+        dtype, fill_value = com._infer_dtype_from_scalar(fill_value)
         block_values = np.empty(block_shape, dtype=dtype)
         block_values.fill(fill_value)
         na_block = make_block(block_values, items, ref_items)
diff --git a/pandas/core/panel.py b/pandas/core/panel.py
@@ -658,7 +658,7 @@ def set_value(self, *args):
             d = self._construct_axes_dict_from(self, axes, copy=False)
             result = self.reindex(**d)
             args  = list(args)
-            args[-1], likely_dtype = _infer_dtype_from_scalar(args[-1])
+            likely_dtype, args[-1] = _infer_dtype_from_scalar(args[-1])
             made_bigger = not np.array_equal(
                 axes[0], getattr(self, self._info_axis))
             # how to make this logic simpler?
@@ -693,7 +693,7 @@ def __setitem__(self, key, value):
             assert(value.shape == shape[1:])
             mat = np.asarray(value)
         elif np.isscalar(value):
-            value, dtype = _infer_dtype_from_scalar(value)
+            dtype, value = _infer_dtype_from_scalar(value)
             mat = np.empty(shape[1:], dtype=dtype)
             mat.fill(value)
         else:
diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py
@@ -149,8 +149,9 @@ def get_new_values(self):
         stride = values.shape[1]
         result_width = width * stride
 
-        new_values = np.empty((length, result_width), dtype=_maybe_promote(values.dtype))
-        new_values.fill(np.nan)
+        dtype, fill_value = _maybe_promote(values.dtype)
+        new_values = np.empty((length, result_width), dtype=dtype)
+        new_values.fill(fill_value)
         new_mask = np.zeros((length, result_width), dtype=bool)
 
         # is there a simpler / faster way of doing this?
@@ -773,12 +774,12 @@ def block2d_to_blocknd(values, items, shape, labels, ref_items=None):
     mask = np.zeros(np.prod(shape), dtype=bool)
     mask.put(selector, True)
 
-    pvalues = np.empty(panel_shape, dtype=values.dtype)
-    if not issubclass(pvalues.dtype.type, (np.integer, np.bool_)):
-        pvalues.fill(np.nan)
-    elif not mask.all():
-        pvalues = _maybe_upcast(pvalues)
-        pvalues.fill(np.nan)
+    if mask.all():
+        pvalues = np.empty(panel_shape, dtype=values.dtype)
+    else:
+        dtype, fill_value = _maybe_promote(values.dtype)
+        pvalues = np.empty(panel_shape, dtype=dtype)
+        pvalues.fill(fill_value)
 
     values = values
     for i in xrange(len(items)):
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -2818,14 +2818,15 @@ def _get_values():
             return values
 
         if offset is None:
-            new_values = pa.empty(len(self), dtype=_maybe_promote(self.dtype))
+            dtype, fill_value = _maybe_promote(self.dtype)
+            new_values = pa.empty(len(self), dtype=dtype)
 
             if periods > 0:
                 new_values[periods:] = self.values[:-periods]
-                new_values[:periods] = nan
+                new_values[:periods] = fill_value
             elif periods < 0:
                 new_values[:periods] = self.values[-periods:]
-                new_values[periods:] = nan
+                new_values[periods:] = fill_value
 
             return Series(new_values, index=self.index, name=self.name)
         elif isinstance(self.index, PeriodIndex):
@@ -3129,7 +3130,7 @@ def _try_cast(arr):
 
             # figure out the dtype from the value (upcast if necessary)
             if dtype is None:
-                value, dtype = _infer_dtype_from_scalar(value)
+                dtype, value = _infer_dtype_from_scalar(value)
             else:
                 # need to possibly convert the value here
                 value = com._possibly_cast_to_datetime(value, dtype)
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -441,8 +441,10 @@ def test_setitem_cast(self):
         self.assert_(self.frame['D'].dtype == np.int64)
 
         # #669, should not cast?
+        # the column is now int64: assigning a scalar replaces the column
+        # using the scalar's dtype, regardless of the existing column's dtype
         self.frame['B'] = 0
-        self.assert_(self.frame['B'].dtype == np.float64)
+        self.assert_(self.frame['B'].dtype == np.int64)
 
         # cast if pass array of course
         self.frame['B'] = np.arange(len(self.frame))