Merge pull request pandas-dev#2985 from jreback/idxmin

jreback · jreback · commit 7b5fae96f0ee · 2013-03-06T17:54:07.000-08:00
BUG: Bug in idxmin/idxmax of ``datetime64[ns]`` Series with ``NaT`` (GH2982)
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -126,6 +126,7 @@ pandas 0.11.0
 
   - Bug on in-place putmasking on an ``integer`` series that needs to be converted to ``float`` (GH2746_)
   - Bug in argsort of ``datetime64[ns]`` Series with ``NaT`` (GH2967_)
+  - Bug in idxmin/idxmax of ``datetime64[ns]`` Series with ``NaT`` (GH2982__)
 
 .. _GH622: https://github.com/pydata/pandas/issues/622
 .. _GH797: https://github.com/pydata/pandas/issues/797
@@ -147,6 +148,7 @@ pandas 0.11.0
 .. _GH2931: https://github.com/pydata/pandas/issues/2931
 .. _GH2973: https://github.com/pydata/pandas/issues/2973
 .. _GH2967: https://github.com/pydata/pandas/issues/2967
+.. _GH2982: https://github.com/pydata/pandas/issues/2982
 
 
 pandas 0.10.1
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -1186,7 +1186,7 @@ A query is specified using the ``Term`` class under the hood.
 
 Valid terms can be created from ``dict, list, tuple, or
 string``. Objects can be embeded as values. Allowed operations are: ``<,
-<=, >, >=, =``. ``=`` will be inferred as an implicit set operation
+<=, >, >=, =, !=``. ``=`` will be inferred as an implicit set operation
 (e.g. if 2 or more values are provided). The following are all valid
 terms.
 
diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -682,19 +682,30 @@ def _infer_dtype_from_scalar(val):
 
 
 def _maybe_promote(dtype, fill_value=np.nan):
+
+    # if we passed an array here, determine the fill value by dtype
+    if isinstance(fill_value,np.ndarray):
+        if issubclass(fill_value.dtype.type, (np.datetime64,np.timedelta64)):
+            fill_value = tslib.iNaT
+        else:
+            fill_value = np.nan
+
     # returns tuple of (dtype, fill_value)
-    if issubclass(dtype.type, np.datetime64):
+    if issubclass(dtype.type, (np.datetime64,np.timedelta64)):
         # for now: refuse to upcast datetime64
         # (this is because datetime64 will not implicitly upconvert
         #  to object correctly as of numpy 1.6.1)
         if isnull(fill_value):
             fill_value = tslib.iNaT
         else:
-            try:
-                fill_value = lib.Timestamp(fill_value).value
-            except:
-                # the proper thing to do here would probably be to upcast to
-                # object (but numpy 1.6.1 doesn't do this properly)
+            if issubclass(dtype.type, np.datetime64):
+                try:
+                    fill_value = lib.Timestamp(fill_value).value
+                except:
+                    # the proper thing to do here would probably be to upcast to
+                    # object (but numpy 1.6.1 doesn't do this properly)
+                    fill_value = tslib.iNaT
+            else:
                 fill_value = tslib.iNaT
     elif is_float(fill_value):
         if issubclass(dtype.type, np.bool_):
@@ -722,24 +733,33 @@ def _maybe_promote(dtype, fill_value=np.nan):
     return dtype, fill_value
 
 
-def _maybe_upcast_putmask(result, mask, other):
+def _maybe_upcast_putmask(result, mask, other, dtype=None):
     """ a safe version of put mask that (potentially upcasts the result
         return the result and a changed flag """
     try:
         np.putmask(result, mask, other)
     except:
         # our type is wrong here, need to upcast
         if (-mask).any():
-            result, fill_value = _maybe_upcast(result, copy=True)
+            result, fill_value = _maybe_upcast(result, fill_value=other, dtype=dtype, copy=True)
             np.putmask(result, mask, other)
             return result, True
 
     return result, False
 
-def _maybe_upcast(values, fill_value=np.nan, copy=False):
+def _maybe_upcast(values, fill_value=np.nan, dtype=None, copy=False):
     """ provide explicty type promotion and coercion
-        if copy == True, then a copy is created even if no upcast is required """
-    new_dtype, fill_value = _maybe_promote(values.dtype, fill_value)
+
+        Parameters
+        ----------
+        values : the ndarray that we want to maybe upcast
+        fill_value : what we want to fill with
+        dtype : if None, then use the dtype of the values, else coerce to this type
+        copy : if True always make a copy even if no upcast is required """
+
+    if dtype is None:
+        dtype = values.dtype
+    new_dtype, fill_value = _maybe_promote(dtype, fill_value)
     if new_dtype != values.dtype:
         values = values.astype(new_dtype)
     elif copy:
@@ -915,7 +935,6 @@ def _possibly_convert_platform(values):
 
     return values
 
-
 def _possibly_cast_to_timedelta(value, coerce=True):
     """ try to cast to timedelta64 w/o coercion """
 
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
@@ -8,6 +8,7 @@
 import pandas.lib as lib
 import pandas.algos as algos
 import pandas.hashtable as _hash
+import pandas.tslib as tslib
 
 try:
     import bottleneck as bn
@@ -69,8 +70,61 @@ def _has_infs(result):
     else:
         return np.isinf(result) or np.isneginf(result)
 
+def _get_fill_value(dtype, fill_value=None, fill_value_typ=None):
+    """ return the correct fill value for the dtype of the values """
+    if fill_value is not None:
+        return fill_value
+    if _na_ok_dtype(dtype):
+        if fill_value_typ is None:
+            return np.nan
+        else:
+            if fill_value_typ == '+inf':
+                return np.inf
+            else:
+                return -np.inf
+    else:
+        if fill_value_typ is None:
+            return tslib.iNaT
+        else:
+            if fill_value_typ == '+inf':
+                # need the max int here
+                return np.iinfo(np.int64).max
+            else:
+                return tslib.iNaT
+
+def _get_values(values, skipna, fill_value=None, fill_value_typ=None, isfinite=False, copy=True):
+    """ utility to get the values view, mask, dtype
+        if necessary copy and mask using the specified fill_value
+        copy = True will force the copy """
+    if isfinite:
+        mask = _isfinite(values)
+    else:
+        mask = isnull(values)
+
+    dtype    = values.dtype
+    dtype_ok = _na_ok_dtype(dtype)
+
+    # get our fill value (in case we need to provide an alternative dtype for it)
+    fill_value = _get_fill_value(dtype, fill_value=fill_value, fill_value_typ=fill_value_typ)
+
+    if skipna:
+        if copy:
+            values = values.copy()
+        if dtype_ok:
+            np.putmask(values, mask, fill_value)
+
+        # promote if needed
+        else:
+            values, changed = com._maybe_upcast_putmask(values, mask, fill_value)
+
+    elif copy:
+        values = values.copy()
+
+    values = _view_if_needed(values)
+    return values, mask, dtype
+
 def _isfinite(values):
-    if issubclass(values.dtype.type, np.timedelta64):
+    if issubclass(values.dtype.type, (np.timedelta64,np.datetime64)):
         return isnull(values)
     return -np.isfinite(values)
 
@@ -99,43 +153,21 @@ def _wrap_results(result,dtype):
     return result
 
 def nanany(values, axis=None, skipna=True):
-    mask = isnull(values)
-
-    if skipna:
-        values = values.copy()
-        np.putmask(values, mask, False)
+    values, mask, dtype = _get_values(values, skipna, False, copy=skipna)
     return values.any(axis)
 
-
 def nanall(values, axis=None, skipna=True):
-    mask = isnull(values)
-
-    if skipna:
-        values = values.copy()
-        np.putmask(values, mask, True)
+    values, mask, dtype = _get_values(values, skipna, True, copy=skipna)
     return values.all(axis)
 
-
 def _nansum(values, axis=None, skipna=True):
-    mask = isnull(values)
-
-    if skipna and not issubclass(values.dtype.type, np.integer):
-        values = values.copy()
-        np.putmask(values, mask, 0)
-
+    values, mask, dtype = _get_values(values, skipna, 0)
     the_sum = values.sum(axis)
     the_sum = _maybe_null_out(the_sum, axis, mask)
-
     return the_sum
 
-
 def _nanmean(values, axis=None, skipna=True):
-    mask = isnull(values)
-
-    if skipna and not issubclass(values.dtype.type, np.integer):
-        values = values.copy()
-        np.putmask(values, mask, 0)
-
+    values, mask, dtype = _get_values(values, skipna, 0)
     the_sum = _ensure_numeric(values.sum(axis))
     count = _get_counts(mask, axis)
 
@@ -186,15 +218,7 @@ def _nanvar(values, axis=None, skipna=True, ddof=1):
 
 
 def _nanmin(values, axis=None, skipna=True):
-    mask = isnull(values)
-
-    dtype = values.dtype
-
-    if skipna and _na_ok_dtype(dtype):
-        values = values.copy()
-        np.putmask(values, mask, np.inf)
-
-    values = _view_if_needed(values)
+    values, mask, dtype = _get_values(values, skipna, fill_value_typ = '+inf')
 
     # numpy 1.6.1 workaround in Python 3.x
     if (values.dtype == np.object_
@@ -218,15 +242,7 @@ def _nanmin(values, axis=None, skipna=True):
 
 
 def _nanmax(values, axis=None, skipna=True):
-    mask = isnull(values)
-
-    dtype = values.dtype
-
-    if skipna and _na_ok_dtype(dtype):
-        values = values.copy()
-        np.putmask(values, mask, -np.inf)
-
-    values = _view_if_needed(values)
+    values, mask, dtype = _get_values(values, skipna, fill_value_typ ='-inf')
 
     # numpy 1.6.1 workaround in Python 3.x
     if (values.dtype == np.object_
@@ -254,11 +270,7 @@ def nanargmax(values, axis=None, skipna=True):
     """
     Returns -1 in the NA case
     """
-    mask = _isfinite(values)
-    values = _view_if_needed(values)
-    if not issubclass(values.dtype.type, np.integer):
-        values = values.copy()
-        np.putmask(values, mask, -np.inf)
+    values, mask, dtype = _get_values(values, skipna, fill_value_typ = '-inf', isfinite=True)
     result = values.argmax(axis)
     result = _maybe_arg_null_out(result, axis, mask, skipna)
     return result
@@ -268,11 +280,7 @@ def nanargmin(values, axis=None, skipna=True):
     """
     Returns -1 in the NA case
     """
-    mask = _isfinite(values)
-    values = _view_if_needed(values)
-    if not issubclass(values.dtype.type, np.integer):
-        values = values.copy()
-        np.putmask(values, mask, np.inf)
+    values, mask, dtype = _get_values(values, skipna, fill_value_typ = '+inf', isfinite=True)
     result = values.argmin(axis)
     result = _maybe_arg_null_out(result, axis, mask, skipna)
     return result
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -2112,13 +2112,13 @@ def argsort(self, axis=0, kind='quicksort', order=None):
         mask = isnull(values)
 
         if mask.any():
-            result = Series(-1,index=self.index,name=self.name)
+            result = Series(-1,index=self.index,name=self.name,dtype='int64')
             notmask = -mask
             result.values[notmask] = np.argsort(self.values[notmask], kind=kind)
             return result
         else:
             return Series(np.argsort(values, kind=kind), index=self.index,
-                          name=self.name)
+                          name=self.name,dtype='int64')
 
     def rank(self, method='average', na_option='keep', ascending=True):
         """
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
@@ -1816,14 +1816,15 @@ def test_timedelta64_functions(self):
         result = td.idxmax()
         self.assert_(result == 2)
 
-        # with NaT (broken)
+        # GH 2982
+        # with NaT 
         td[0] = np.nan
 
-        #result = td.idxmin()
-        #self.assert_(result == 1)
+        result = td.idxmin()
+        self.assert_(result == 1)
 
-        #result = td.idxmax()
-        #self.assert_(result == 2)
+        result = td.idxmax()
+        self.assert_(result == 2)
 
         # abs
         s1 = Series(date_range('20120101',periods=3))
@@ -2065,6 +2066,16 @@ def test_idxmin(self):
         allna = self.series * nan
         self.assert_(isnull(allna.idxmin()))
 
+        # datetime64[ns]
+        from pandas import date_range
+        s = Series(date_range('20130102',periods=6))
+        result = s.idxmin()
+        self.assert_(result == 0)
+
+        s[0] = np.nan
+        result = s.idxmin()
+        self.assert_(result == 1)
+
     def test_idxmax(self):
         # test idxmax
         # _check_stat_op approach can not be used here because of isnull check.
@@ -2086,6 +2097,15 @@ def test_idxmax(self):
         allna = self.series * nan
         self.assert_(isnull(allna.idxmax()))
 
+        from pandas import date_range
+        s = Series(date_range('20130102',periods=6))
+        result = s.idxmax()
+        self.assert_(result == 5)
+
+        s[5] = np.nan
+        result = s.idxmax()
+        self.assert_(result == 4)
+
     def test_operators_corner(self):
         series = self.ts