BUG: rework object inference with NaN as the first element in an array

jreback · jreback · commit 4392ae38f3b8 · 2014-01-25T15:43:29.000-05:00
BUG: improve conversion of datetime-like values from list-of-lists
BUG: make rank of M8 work via object algos
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -154,7 +154,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1):
         uniques = uniques.take(sorter)
 
     if is_datetime:
-        uniques = uniques.view('M8[ns]')
+        uniques = uniques.astype('M8[ns]')
     if isinstance(values, PeriodIndex):
         uniques = PeriodIndex(ordinal=uniques, freq=values.freq)
 
@@ -279,6 +279,7 @@ def rank(values, axis=0, method='average', na_option='keep',
         f, values = _get_data_algo(values, _rank2d_functions)
         ranks = f(values, axis=axis, ties_method=method,
                   ascending=ascending, na_option=na_option)
+
     return ranks
 
 
@@ -364,12 +365,22 @@ def _interpolate(a, b, fraction):
 
 
 def _get_data_algo(values, func_map):
+    mask = None
     if com.is_float_dtype(values):
         f = func_map['float64']
         values = com._ensure_float64(values)
     elif com.is_datetime64_dtype(values):
-        f = func_map['int64']
-        values = values.view('i8')
+
+        # if we have NaT, punt to object dtype
+        mask = com.isnull(values)
+        if mask.ravel().any():
+            f = func_map['generic']
+            values = com._ensure_object(values)
+            values[mask] = np.nan
+        else:
+            f = func_map['int64']
+            values = values.view('i8')
+
     elif com.is_integer_dtype(values):
         f = func_map['int64']
         values = com._ensure_int64(values)
diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -40,7 +40,7 @@ class AmbiguousIndexError(PandasError, KeyError):
     pass
 
 
-_POSSIBLY_CAST_DTYPES = set([np.dtype(t)
+_POSSIBLY_CAST_DTYPES = set([np.dtype(t).name
                              for t in ['M8[ns]', '>M8[ns]', '<M8[ns]',
                                        'm8[ns]', '>m8[ns]', '<m8[ns]',
                                        'O', 'int8',
@@ -1612,7 +1612,7 @@ def _possibly_convert_objects(values, convert_dates=True,
 
 
 def _possibly_castable(arr):
-    return arr.dtype not in _POSSIBLY_CAST_DTYPES
+    return arr.dtype.name not in _POSSIBLY_CAST_DTYPES
 
 
 def _possibly_convert_platform(values):
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -4696,9 +4696,14 @@ def _convert_object_array(content, columns, coerce_float=False, dtype=None):
             raise AssertionError('%d columns passed, passed data had %s '
                                  'columns' % (len(columns), len(content)))
 
-    arrays = [lib.maybe_convert_objects(arr, try_float=coerce_float)
-              if dtype != object and dtype != np.object else arr
-              for arr in content]
+    # provide soft conversion of object dtypes
+    def convert(arr):
+        if dtype != object and dtype != np.object:
+            arr = lib.maybe_convert_objects(arr, try_float=coerce_float)
+            arr = com._possibly_cast_to_datetime(arr, dtype)
+        return arr
+
+    arrays = [ convert(arr) for arr in content ]
 
     return arrays, columns
 
diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx
@@ -56,7 +56,14 @@ def infer_dtype(object _values):
     if n == 0:
         return 'empty'
 
-    val = util.get_value_1d(values, 0)
+    # make contiguous
+    values = values.ravel()
+
+    # try to use a valid value
+    for i in range(n):
+       val = util.get_value_1d(values, i)
+       if not is_null_datetimelike(val):
+           break
 
     if util.is_datetime64_object(val) or val is NaT:
         if is_datetime64_array(values):
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -10489,6 +10489,8 @@ def test_rank2(self):
                 [datetime(2000, 1, 2), datetime(2000, 1, 3),
                  datetime(2000, 1, 1)]]
         df = DataFrame(data)
+
+        # check the rank
         expected = DataFrame([[2., nan, 1.],
                               [2., 3., 1.]])
         result = df.rank(1, numeric_only=False)
@@ -10497,14 +10499,6 @@ def test_rank2(self):
         # mixed-type frames
         self.mixed_frame['datetime'] = datetime.now()
         self.mixed_frame['timedelta'] = timedelta(days=1,seconds=1)
-        self.assert_(self.mixed_frame['datetime'].dtype == 'M8[ns]')
-        self.assert_(self.mixed_frame['timedelta'].dtype == 'm8[ns]')
-        result = self.mixed_frame.get_dtype_counts().order()
-        expected = Series({ 'float64' : 4,
-                            'object' : 1,
-                            'datetime64[ns]' : 1,
-                            'timedelta64[ns]' : 1}).order()
-        assert_series_equal(result,expected)
 
         result = self.mixed_frame.rank(1)
         expected = self.mixed_frame.rank(1, numeric_only=True)
@@ -11097,6 +11091,31 @@ def test_constructor_with_convert(self):
                                       None], np.object_))
         assert_series_equal(result, expected)
 
+    def test_construction_with_mixed(self):
+        # test construction edge cases with mixed types
+
+        # f7u12, this does not work without extensive workaround
+        data = [[datetime(2001, 1, 5), nan, datetime(2001, 1, 2)],
+                [datetime(2000, 1, 2), datetime(2000, 1, 3),
+                 datetime(2000, 1, 1)]]
+        df = DataFrame(data)
+
+        # check dtypes
+        result = df.get_dtype_counts().order()
+        expected = Series({ 'datetime64[ns]' : 3 })
+
+        # mixed-type frames
+        self.mixed_frame['datetime'] = datetime.now()
+        self.mixed_frame['timedelta'] = timedelta(days=1,seconds=1)
+        self.assert_(self.mixed_frame['datetime'].dtype == 'M8[ns]')
+        self.assert_(self.mixed_frame['timedelta'].dtype == 'm8[ns]')
+        result = self.mixed_frame.get_dtype_counts().order()
+        expected = Series({ 'float64' : 4,
+                            'object' : 1,
+                            'datetime64[ns]' : 1,
+                            'timedelta64[ns]' : 1}).order()
+        assert_series_equal(result,expected)
+
     def test_constructor_frame_copy(self):
         cop = DataFrame(self.frame, copy=True)
         cop['A'] = 5
diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py
@@ -14,6 +14,7 @@
 
 from pandas.core.index import (Index, Float64Index, Int64Index, MultiIndex,
                                InvalidIndexError)
+from pandas.tseries.index import DatetimeIndex
 from pandas.core.frame import DataFrame
 from pandas.core.series import Series
 from pandas.util.testing import (assert_almost_equal, assertRaisesRegexp,
@@ -32,6 +33,9 @@
 
 from pandas import _np_version_under1p7
 
+def _skip_if_need_numpy_1_7():
+    if _np_version_under1p7:
+        raise nose.SkipTest('numpy >= 1.7 required')
 
 class TestIndex(tm.TestCase):
     _multiprocess_can_split_ = True
@@ -236,12 +240,7 @@ def test_asof(self):
         tm.assert_isinstance(self.dateIndex.asof(d), Timestamp)
 
     def test_nanosecond_index_access(self):
-        if _np_version_under1p7:
-            import nose
-
-            raise nose.SkipTest('numpy >= 1.7 required')
-
-        from pandas import Series, Timestamp, DatetimeIndex
+        _skip_if_need_numpy_1_7()
 
         s = Series([Timestamp('20130101')]).values.view('i8')[0]
         r = DatetimeIndex([s + 50 + i for i in range(100)])
@@ -1607,11 +1606,12 @@ def test_get_level_values_na(self):
         expected = ['a', np.nan, 1]
         assert_array_equal(values.values, expected)
 
-        arrays = [['a', 'b', 'b'], pd.DatetimeIndex([0, 1, pd.NaT])]
-        index = pd.MultiIndex.from_arrays(arrays)
-        values = index.get_level_values(1)
-        expected = pd.DatetimeIndex([0, 1, pd.NaT])
-        assert_array_equal(values.values, expected.values)
+        if not _np_version_under1p7:
+            arrays = [['a', 'b', 'b'], pd.DatetimeIndex([0, 1, pd.NaT])]
+            index = pd.MultiIndex.from_arrays(arrays)
+            values = index.get_level_values(1)
+            expected = pd.DatetimeIndex([0, 1, pd.NaT])
+            assert_array_equal(values.values, expected.values)
 
         arrays = [[], []]
         index = pd.MultiIndex.from_arrays(arrays)