PERF: add support for NaT in hashtable factorizers, improving Categorical construction with NaT

jreback · jreback · commit 81bb972259d2 · 2016-01-25T10:28:59.000-05:00
closes #12077 ``` before after ratio [1330b9f] [404911c6] 17.25ms 1.21ms 0.07 categoricals.categorical_constructor_with_datetimes.time_datetimes_with_nat ``` Author: Jeff Reback <jeff@reback.net> Closes #12128 from jreback/cat_perf and squashes the following commits: ca6e471 [Jeff Reback] rank functions now handle int64 and missing values e1385d8 [Jeff Reback] PERF: add support for NaT in hashtable factorizers, improving Categorical construction with NaT, #12077
diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py
@@ -46,6 +46,22 @@ def time_fastpath(self):
         Categorical(self.codes, self.cat_idx, fastpath=True)
 
 
+class categorical_constructor_with_datetimes(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.datetimes = pd.Series(pd.date_range(
+            '1995-01-01 00:00:00', periods=10000, freq='s'))
+
+    def time_datetimes(self):
+        Categorical(self.datetimes)
+
+    def time_datetimes_with_nat(self):
+        t = self.datetimes
+        t.iloc[-1] = pd.NaT
+        Categorical(t)
+
+
 class categorical_rendering(object):
     goal_time = 3e-3
 
diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt
@@ -458,7 +458,7 @@ Performance Improvements
 - Improved huge ``DatetimeIndex``, ``PeriodIndex`` and ``TimedeltaIndex``'s ops performance including ``NaT`` (:issue:`10277`)
 - Improved performance of ``pandas.concat`` (:issue:`11958`)
 - Improved performance of ``StataReader`` (:issue:`11591`)
-
+- Improved performance in construction of ``Categoricals`` with Series of datetimes containing ``NaT`` (:issue:`12077`)
 
 
 
@@ -481,6 +481,7 @@ Bug Fixes
 - Bug in vectorized ``DateOffset`` when ``n`` parameter is ``0`` (:issue:`11370`)
 - Compat for numpy 1.11 w.r.t. ``NaT`` comparison changes (:issue:`12049`)
 - Bug in ``read_csv`` when reading from a ``StringIO`` in threads (:issue:`11790`)
+- Bug in factorizing datetimelikes and in constructing ``Categoricals`` from them where ``NaT`` was not treated as a missing value (:issue:`12077`)
 
 
 
diff --git a/pandas/algos.pyx b/pandas/algos.pyx
@@ -226,14 +226,27 @@ def rank_1d_int64(object in_arr, ties_method='average', ascending=True,
         ndarray[int64_t] sorted_data, values
         ndarray[float64_t] ranks
         ndarray[int64_t] argsorted
-        int64_t val
+        int64_t val, nan_value
         float64_t sum_ranks = 0
+        bint keep_na
         int tiebreak = 0
         float count = 0.0
     tiebreak = tiebreakers[ties_method]
 
+    keep_na = na_option == 'keep'
+
     values = np.asarray(in_arr)
 
+    if ascending ^ (na_option == 'top'):
+        nan_value = np.iinfo('int64').max
+    else:
+        nan_value = np.iinfo('int64').min
+
+    # unlike floats, which have np.inf, -np.inf, and np.nan
+    # ints do not
+    mask = values == iNaT
+    np.putmask(values, mask, nan_value)
+
     n = len(values)
     ranks = np.empty(n, dtype='f8')
 
@@ -256,6 +269,9 @@ def rank_1d_int64(object in_arr, ties_method='average', ascending=True,
         sum_ranks += i + 1
         dups += 1
         val = sorted_data[i]
+        if (val == nan_value) and keep_na:
+            ranks[argsorted[i]] = nan
+            continue
         count += 1.0
         if i == n - 1 or fabs(sorted_data[i + 1] - val) > 0:
             if tiebreak == TIEBREAK_AVERAGE:
@@ -387,16 +403,30 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average',
         ndarray[float64_t, ndim=2] ranks
         ndarray[int64_t, ndim=2] argsorted
         ndarray[int64_t, ndim=2, cast=True] values
-        int64_t val
+        int64_t val, nan_value
         float64_t sum_ranks = 0
+        bint keep_na = 0
         int tiebreak = 0
         float count = 0.0
     tiebreak = tiebreakers[ties_method]
 
+    keep_na = na_option == 'keep'
+
+    in_arr = np.asarray(in_arr)
+
     if axis == 0:
-        values = np.asarray(in_arr).T
+        values = in_arr.T.copy()
+    else:
+        values = in_arr.copy()
+
+    if ascending ^ (na_option == 'top'):
+        nan_value = np.iinfo('int64').max
     else:
-        values = np.asarray(in_arr)
+        nan_value = np.iinfo('int64').min
+
+    # unlike floats, which have np.inf, -np.inf, and np.nan
+    # ints do not
+    np.putmask(values, values == iNaT, nan_value)
 
     n, k = (<object> values).shape
     ranks = np.empty((n, k), dtype='f8')
@@ -423,6 +453,9 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average',
             sum_ranks += j + 1
             dups += 1
             val = values[i, j]
+            if val == nan_value and keep_na:
+                ranks[i, argsorted[i, j]] = nan
+                continue
             count += 1.0
             if j == k - 1 or fabs(values[i, j + 1] - val) > FP_ERR:
                 if tiebreak == TIEBREAK_AVERAGE:
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -11,6 +11,7 @@
 import pandas.algos as algos
 import pandas.hashtable as htable
 from pandas.compat import string_types
+from pandas.tslib import iNaT
 
 
 def match(to_match, values, na_sentinel=-1):
@@ -182,17 +183,23 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
               "https://github.com/pydata/pandas/issues/6926"
         warn(msg, FutureWarning, stacklevel=2)
 
-    from pandas.core.index import Index
-    from pandas.core.series import Series
+    from pandas import Index, Series, DatetimeIndex
+
     vals = np.asarray(values)
 
+    # strip the tz before factorizing; it is re-applied to the uniques below
+    is_datetimetz = com.is_datetimetz(values)
+    if is_datetimetz:
+        values = DatetimeIndex(values)
+        vals = values.tz_localize(None)
+
     is_datetime = com.is_datetime64_dtype(vals)
     is_timedelta = com.is_timedelta64_dtype(vals)
     (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables)
 
     table = hash_klass(size_hint or len(vals))
     uniques = vec_klass()
-    labels = table.get_labels(vals, uniques, 0, na_sentinel)
+    labels = table.get_labels(vals, uniques, 0, na_sentinel, True)
 
     labels = com._ensure_platform_int(labels)
 
@@ -224,7 +231,12 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
 
         uniques = uniques.take(sorter)
 
-    if is_datetime:
+    if is_datetimetz:
+
+        # reset tz
+        uniques = DatetimeIndex(uniques.astype('M8[ns]')).tz_localize(
+            values.tz)
+    elif is_datetime:
         uniques = uniques.astype('M8[ns]')
     elif is_timedelta:
         uniques = uniques.astype('m8[ns]')
@@ -296,7 +308,6 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
             keys, counts = htable.value_count_scalar64(values, dropna)
 
             if dropna:
-                from pandas.tslib import iNaT
                 msk = keys != iNaT
                 keys, counts = keys[msk], counts[msk]
 
@@ -478,22 +489,13 @@ def _interpolate(a, b, fraction):
 
 
 def _get_data_algo(values, func_map):
-    mask = None
     if com.is_float_dtype(values):
         f = func_map['float64']
         values = com._ensure_float64(values)
 
     elif com.needs_i8_conversion(values):
-
-        # if we have NaT, punt to object dtype
-        mask = com.isnull(values)
-        if mask.ravel().any():
-            f = func_map['generic']
-            values = com._ensure_object(values)
-            values[mask] = np.nan
-        else:
-            f = func_map['int64']
-            values = values.view('i8')
+        f = func_map['int64']
+        values = values.view('i8')
 
     elif com.is_integer_dtype(values):
         f = func_map['int64']
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
@@ -257,7 +257,7 @@ def __init__(self, values, categories=None, ordered=False, name=None,
                 categories = values.categories
             values = values.__array__()
 
-        elif isinstance(values, ABCIndexClass):
+        elif isinstance(values, (ABCIndexClass, ABCSeries)):
             pass
 
         else:
@@ -1177,7 +1177,7 @@ def get_values(self):
         """
         # if we are a datetime and period index, return Index to keep metadata
         if com.is_datetimelike(self.categories):
-            return self.categories.take(self._codes)
+            return self.categories.take(self._codes, fill_value=np.nan)
         return np.array(self)
 
     def check_for_ordered(self, op):
diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx
@@ -377,12 +377,13 @@ cdef class Int64HashTable(HashTable):
 
     def factorize(self, ndarray[object] values):
         reverse = {}
-        labels = self.get_labels(values, reverse, 0)
+        labels = self.get_labels(values, reverse, 0, 0)
         return reverse, labels
 
     @cython.boundscheck(False)
     def get_labels(self, int64_t[:] values, Int64Vector uniques,
-                   Py_ssize_t count_prior, Py_ssize_t na_sentinel):
+                   Py_ssize_t count_prior, Py_ssize_t na_sentinel,
+                   bint check_null=True):
         cdef:
             Py_ssize_t i, n = len(values)
             int64_t[:] labels
@@ -399,6 +400,11 @@ cdef class Int64HashTable(HashTable):
             for i in range(n):
                 val = values[i]
                 k = kh_get_int64(self.table, val)
+
+                if check_null and val == iNaT:
+                    labels[i] = na_sentinel
+                    continue
+
                 if k != self.table.n_buckets:
                     idx = self.table.vals[k]
                     labels[i] = idx
@@ -525,13 +531,14 @@ cdef class Float64HashTable(HashTable):
 
     def factorize(self, float64_t[:] values):
         uniques = Float64Vector()
-        labels = self.get_labels(values, uniques, 0, -1)
+        labels = self.get_labels(values, uniques, 0, -1, 1)
         return uniques.to_array(), labels
 
     @cython.boundscheck(False)
     def get_labels(self, float64_t[:] values,
-                     Float64Vector uniques,
-                     Py_ssize_t count_prior, int64_t na_sentinel):
+                   Float64Vector uniques,
+                   Py_ssize_t count_prior, int64_t na_sentinel,
+                   bint check_null=True):
         cdef:
             Py_ssize_t i, n = len(values)
             int64_t[:] labels
@@ -548,7 +555,7 @@ cdef class Float64HashTable(HashTable):
             for i in range(n):
                 val = values[i]
 
-                if val != val:
+                if check_null and val != val:
                     labels[i] = na_sentinel
                     continue
 
@@ -762,7 +769,8 @@ cdef class PyObjectHashTable(HashTable):
         return uniques.to_array()
 
     def get_labels(self, ndarray[object] values, ObjectVector uniques,
-                     Py_ssize_t count_prior, int64_t na_sentinel):
+                   Py_ssize_t count_prior, int64_t na_sentinel,
+                   bint check_null=True):
         cdef:
             Py_ssize_t i, n = len(values)
             int64_t[:] labels
@@ -777,7 +785,7 @@ cdef class PyObjectHashTable(HashTable):
             val = values[i]
             hash(val)
 
-            if val != val or val is None:
+            if check_null and val != val or val is None:
                 labels[i] = na_sentinel
                 continue
 
@@ -808,14 +816,15 @@ cdef class Factorizer:
     def get_count(self):
         return self.count
 
-    def factorize(self, ndarray[object] values, sort=False, na_sentinel=-1):
+    def factorize(self, ndarray[object] values, sort=False, na_sentinel=-1,
+                  check_null=True):
         """
         Factorize values with nans replaced by na_sentinel
         >>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20)
         array([ 0,  1, 20])
         """
         labels = self.table.get_labels(values, self.uniques,
-                                       self.count, na_sentinel)
+                                       self.count, na_sentinel, check_null)
         mask = (labels == na_sentinel)
         # sort on
         if sort:
@@ -848,9 +857,10 @@ cdef class Int64Factorizer:
         return self.count
 
     def factorize(self, int64_t[:] values, sort=False,
-                  na_sentinel=-1):
+                  na_sentinel=-1, check_null=True):
         labels = self.table.get_labels(values, self.uniques,
-                                       self.count, na_sentinel)
+                                       self.count, na_sentinel,
+                                       check_null)
 
         # sort on
         if sort:
diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
@@ -787,7 +787,12 @@ def test_rank2(self):
         # check the rank
         expected = DataFrame([[2., nan, 1.],
                               [2., 3., 1.]])
-        result = df.rank(1, numeric_only=False)
+        result = df.rank(1, numeric_only=False, ascending=True)
+        assert_frame_equal(result, expected)
+
+        expected = DataFrame([[1., nan, 2.],
+                              [2., 1., 3.]])
+        result = df.rank(1, numeric_only=False, ascending=False)
         assert_frame_equal(result, expected)
 
         # mixed-type frames
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
@@ -308,6 +308,38 @@ def test_constructor_with_generator(self):
         cat = pd.Categorical([0, 1, 2], categories=xrange(3))
         self.assertTrue(cat.equals(exp))
 
+    def test_constructor_with_datetimelike(self):
+
+        # 12077
+        # constructor with a datetimelike and NaT
+
+        for dtl in [pd.date_range('1995-01-01 00:00:00',
+                                  periods=5, freq='s'),
+                    pd.date_range('1995-01-01 00:00:00',
+                                  periods=5, freq='s', tz='US/Eastern'),
+                    pd.timedelta_range('1 day', periods=5, freq='s')]:
+
+            s = Series(dtl)
+            c = Categorical(s)
+            expected = type(dtl)(s)
+            expected.freq = None
+            tm.assert_index_equal(c.categories, expected)
+            self.assert_numpy_array_equal(c.codes, np.arange(5, dtype='int8'))
+
+            # with NaT
+            s2 = s.copy()
+            s2.iloc[-1] = pd.NaT
+            c = Categorical(s2)
+            expected = type(dtl)(s2.dropna())
+            expected.freq = None
+            tm.assert_index_equal(c.categories, expected)
+            self.assert_numpy_array_equal(c.codes,
+                                          np.concatenate([np.arange(4, dtype='int8'),
+                                                      [-1]]))
+
+            result = repr(c)
+            self.assertTrue('NaT' in result)
+
     def test_from_codes(self):
 
         # too few categories
diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py
@@ -832,7 +832,7 @@ def _format_native_types(self, na_rep=u('NaT'), date_format=None,
         values[imask] = np.array([formatter(dt) for dt in values[imask]])
         return values
 
-    def take(self, indices, axis=0):
+    def take(self, indices, axis=0, allow_fill=True, fill_value=None):
         """
         Analogous to ndarray.take
         """