From e1385d84b138675231bfdd062e1a7c5c19a4f398 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 24 Jan 2016 19:07:23 -0500 Subject: [PATCH 1/2] PERF: add support for NaT in hashtable factorizers, improving Categorical construction with NaT, #12077 --- asv_bench/benchmarks/categoricals.py | 16 +++++++++++++ doc/source/whatsnew/v0.18.0.txt | 3 ++- pandas/core/algorithms.py | 34 +++++++++++++++------------- pandas/core/categorical.py | 4 ++-- pandas/hashtable.pyx | 34 ++++++++++++++++++---------- pandas/tests/test_categorical.py | 32 ++++++++++++++++++++++++++ pandas/tseries/period.py | 2 +- 7 files changed, 93 insertions(+), 32 deletions(-) diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index d32c19d6d0bb8..244af3a577fe2 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -46,6 +46,22 @@ def time_fastpath(self): Categorical(self.codes, self.cat_idx, fastpath=True) +class categorical_constructor_with_datetimes(object): + goal_time = 0.2 + + def setup(self): + self.datetimes = pd.Series(pd.date_range( + '1995-01-01 00:00:00', periods=10000, freq='s')) + + def time_datetimes(self): + Categorical(self.datetimes) + + def time_datetimes_with_nat(self): + t = self.datetimes + t.iloc[-1] = pd.NaT + Categorical(t) + + class categorical_rendering(object): goal_time = 3e-3 diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt index 75a38544fb8eb..115e286acdac1 100644 --- a/doc/source/whatsnew/v0.18.0.txt +++ b/doc/source/whatsnew/v0.18.0.txt @@ -458,7 +458,7 @@ Performance Improvements - Improved huge ``DatetimeIndex``, ``PeriodIndex`` and ``TimedeltaIndex``'s ops performance including ``NaT`` (:issue:`10277`) - Improved performance of ``pandas.concat`` (:issue:`11958`) - Improved performance of ``StataReader`` (:issue:`11591`) - +- Improved performance in construction of ``Categoricals`` with Series of datetimes containing ``NaT`` (:issue:`12077`) @@ -481,6 +481,7 
@@ Bug Fixes - Bug in vectorized ``DateOffset`` when ``n`` parameter is ``0`` (:issue:`11370`) - Compat for numpy 1.11 w.r.t. ``NaT`` comparison changes (:issue:`12049`) - Bug in ``read_csv`` when reading from a ``StringIO`` in threads (:issue:`11790`) +- Bug in not treating ``NaT`` as a missing value in datetimelikes when factorizing & with ``Categoricals`` (:issue:`12077`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index d1c983769ed2a..d516471ededb6 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -11,6 +11,7 @@ import pandas.algos as algos import pandas.hashtable as htable from pandas.compat import string_types +from pandas.tslib import iNaT def match(to_match, values, na_sentinel=-1): @@ -182,17 +183,23 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): "https://github.com/pydata/pandas/issues/6926" warn(msg, FutureWarning, stacklevel=2) - from pandas.core.index import Index - from pandas.core.series import Series + from pandas import Index, Series, DatetimeIndex + vals = np.asarray(values) + # localize to UTC + is_datetimetz = com.is_datetimetz(values) + if is_datetimetz: + values = DatetimeIndex(values) + vals = values.tz_localize(None) + is_datetime = com.is_datetime64_dtype(vals) is_timedelta = com.is_timedelta64_dtype(vals) (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables) table = hash_klass(size_hint or len(vals)) uniques = vec_klass() - labels = table.get_labels(vals, uniques, 0, na_sentinel) + labels = table.get_labels(vals, uniques, 0, na_sentinel, True) labels = com._ensure_platform_int(labels) @@ -224,7 +231,12 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): uniques = uniques.take(sorter) - if is_datetime: + if is_datetimetz: + + # reset tz + uniques = DatetimeIndex(uniques.astype('M8[ns]')).tz_localize( + values.tz) + elif is_datetime: uniques = uniques.astype('M8[ns]') elif is_timedelta: uniques = 
uniques.astype('m8[ns]') @@ -296,7 +308,6 @@ def value_counts(values, sort=True, ascending=False, normalize=False, keys, counts = htable.value_count_scalar64(values, dropna) if dropna: - from pandas.tslib import iNaT msk = keys != iNaT keys, counts = keys[msk], counts[msk] @@ -478,22 +489,13 @@ def _interpolate(a, b, fraction): def _get_data_algo(values, func_map): - mask = None if com.is_float_dtype(values): f = func_map['float64'] values = com._ensure_float64(values) elif com.needs_i8_conversion(values): - - # if we have NaT, punt to object dtype - mask = com.isnull(values) - if mask.ravel().any(): - f = func_map['generic'] - values = com._ensure_object(values) - values[mask] = np.nan - else: - f = func_map['int64'] - values = values.view('i8') + f = func_map['int64'] + values = values.view('i8') elif com.is_integer_dtype(values): f = func_map['int64'] diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 8a6ea69058c7e..23740f1983b43 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -257,7 +257,7 @@ def __init__(self, values, categories=None, ordered=False, name=None, categories = values.categories values = values.__array__() - elif isinstance(values, ABCIndexClass): + elif isinstance(values, (ABCIndexClass, ABCSeries)): pass else: @@ -1177,7 +1177,7 @@ def get_values(self): """ # if we are a datetime and period index, return Index to keep metadata if com.is_datetimelike(self.categories): - return self.categories.take(self._codes) + return self.categories.take(self._codes, fill_value=np.nan) return np.array(self) def check_for_ordered(self, op): diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx index a5fcbd3f2d0f1..f718c1ab0b8da 100644 --- a/pandas/hashtable.pyx +++ b/pandas/hashtable.pyx @@ -377,12 +377,13 @@ cdef class Int64HashTable(HashTable): def factorize(self, ndarray[object] values): reverse = {} - labels = self.get_labels(values, reverse, 0) + labels = self.get_labels(values, reverse, 0, 0) return 
reverse, labels @cython.boundscheck(False) def get_labels(self, int64_t[:] values, Int64Vector uniques, - Py_ssize_t count_prior, Py_ssize_t na_sentinel): + Py_ssize_t count_prior, Py_ssize_t na_sentinel, + bint check_null=True): cdef: Py_ssize_t i, n = len(values) int64_t[:] labels @@ -399,6 +400,11 @@ cdef class Int64HashTable(HashTable): for i in range(n): val = values[i] k = kh_get_int64(self.table, val) + + if check_null and val == iNaT: + labels[i] = na_sentinel + continue + if k != self.table.n_buckets: idx = self.table.vals[k] labels[i] = idx @@ -525,13 +531,14 @@ cdef class Float64HashTable(HashTable): def factorize(self, float64_t[:] values): uniques = Float64Vector() - labels = self.get_labels(values, uniques, 0, -1) + labels = self.get_labels(values, uniques, 0, -1, 1) return uniques.to_array(), labels @cython.boundscheck(False) def get_labels(self, float64_t[:] values, - Float64Vector uniques, - Py_ssize_t count_prior, int64_t na_sentinel): + Float64Vector uniques, + Py_ssize_t count_prior, int64_t na_sentinel, + bint check_null=True): cdef: Py_ssize_t i, n = len(values) int64_t[:] labels @@ -548,7 +555,7 @@ cdef class Float64HashTable(HashTable): for i in range(n): val = values[i] - if val != val: + if check_null and val != val: labels[i] = na_sentinel continue @@ -762,7 +769,8 @@ cdef class PyObjectHashTable(HashTable): return uniques.to_array() def get_labels(self, ndarray[object] values, ObjectVector uniques, - Py_ssize_t count_prior, int64_t na_sentinel): + Py_ssize_t count_prior, int64_t na_sentinel, + bint check_null=True): cdef: Py_ssize_t i, n = len(values) int64_t[:] labels @@ -777,7 +785,7 @@ cdef class PyObjectHashTable(HashTable): val = values[i] hash(val) - if val != val or val is None: + if check_null and val != val or val is None: labels[i] = na_sentinel continue @@ -808,14 +816,15 @@ cdef class Factorizer: def get_count(self): return self.count - def factorize(self, ndarray[object] values, sort=False, na_sentinel=-1): + def 
factorize(self, ndarray[object] values, sort=False, na_sentinel=-1, + check_null=True): """ Factorize values with nans replaced by na_sentinel >>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20) array([ 0, 1, 20]) """ labels = self.table.get_labels(values, self.uniques, - self.count, na_sentinel) + self.count, na_sentinel, check_null) mask = (labels == na_sentinel) # sort on if sort: @@ -848,9 +857,10 @@ cdef class Int64Factorizer: return self.count def factorize(self, int64_t[:] values, sort=False, - na_sentinel=-1): + na_sentinel=-1, check_null=True): labels = self.table.get_labels(values, self.uniques, - self.count, na_sentinel) + self.count, na_sentinel, + check_null) # sort on if sort: diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 8a9827b9d5533..733ed2fbcb971 100755 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -308,6 +308,38 @@ def test_constructor_with_generator(self): cat = pd.Categorical([0, 1, 2], categories=xrange(3)) self.assertTrue(cat.equals(exp)) + def test_constructor_with_datetimelike(self): + + # 12077 + # constructor with a datetimelike and NaT + + for dtl in [pd.date_range('1995-01-01 00:00:00', + periods=5, freq='s'), + pd.date_range('1995-01-01 00:00:00', + periods=5, freq='s', tz='US/Eastern'), + pd.timedelta_range('1 day', periods=5, freq='s')]: + + s = Series(dtl) + c = Categorical(s) + expected = type(dtl)(s) + expected.freq = None + tm.assert_index_equal(c.categories, expected) + self.assert_numpy_array_equal(c.codes, np.arange(5, dtype='int8')) + + # with NaT + s2 = s.copy() + s2.iloc[-1] = pd.NaT + c = Categorical(s2) + expected = type(dtl)(s2.dropna()) + expected.freq = None + tm.assert_index_equal(c.categories, expected) + self.assert_numpy_array_equal(c.codes, + np.concatenate([np.arange(4, dtype='int8'), + [-1]])) + + result = repr(c) + self.assertTrue('NaT' in result) + def test_from_codes(self): # too few categories diff --git
a/pandas/tseries/period.py b/pandas/tseries/period.py index 911277429ce86..05ca65d6946fb 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -832,7 +832,7 @@ def _format_native_types(self, na_rep=u('NaT'), date_format=None, values[imask] = np.array([formatter(dt) for dt in values[imask]]) return values - def take(self, indices, axis=0): + def take(self, indices, axis=0, allow_fill=True, fill_value=None): """ Analogous to ndarray.take """ From ca6e471e03c124700c0eaee3cef2f78a64408e01 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 24 Jan 2016 21:19:26 -0500 Subject: [PATCH 2/2] rank functions now handle int64 and missing values --- pandas/algos.pyx | 41 +++++++++++++++++++++++++--- pandas/tests/frame/test_analytics.py | 7 ++++- 2 files changed, 43 insertions(+), 5 deletions(-) diff --git a/pandas/algos.pyx b/pandas/algos.pyx index 62ee6ced84882..0f9ceba48e608 100644 --- a/pandas/algos.pyx +++ b/pandas/algos.pyx @@ -226,14 +226,27 @@ def rank_1d_int64(object in_arr, ties_method='average', ascending=True, ndarray[int64_t] sorted_data, values ndarray[float64_t] ranks ndarray[int64_t] argsorted - int64_t val + int64_t val, nan_value float64_t sum_ranks = 0 + bint keep_na int tiebreak = 0 float count = 0.0 tiebreak = tiebreakers[ties_method] + keep_na = na_option == 'keep' + values = np.asarray(in_arr) + if ascending ^ (na_option == 'top'): + nan_value = np.iinfo('int64').max + else: + nan_value = np.iinfo('int64').min + + # unlike floats, which have np.inf, -np.inf, and np.nan + # ints do not + mask = values == iNaT + np.putmask(values, mask, nan_value) + n = len(values) ranks = np.empty(n, dtype='f8') @@ -256,6 +269,9 @@ def rank_1d_int64(object in_arr, ties_method='average', ascending=True, sum_ranks += i + 1 dups += 1 val = sorted_data[i] + if (val == nan_value) and keep_na: + ranks[argsorted[i]] = nan + continue count += 1.0 if i == n - 1 or fabs(sorted_data[i + 1] - val) > 0: if tiebreak == TIEBREAK_AVERAGE: @@ -387,16 +403,30 @@ def 
rank_2d_int64(object in_arr, axis=0, ties_method='average', ndarray[float64_t, ndim=2] ranks ndarray[int64_t, ndim=2] argsorted ndarray[int64_t, ndim=2, cast=True] values - int64_t val + int64_t val, nan_value float64_t sum_ranks = 0 + bint keep_na = 0 int tiebreak = 0 float count = 0.0 tiebreak = tiebreakers[ties_method] + keep_na = na_option == 'keep' + + in_arr = np.asarray(in_arr) + if axis == 0: - values = np.asarray(in_arr).T + values = in_arr.T.copy() + else: + values = in_arr.copy() + + if ascending ^ (na_option == 'top'): + nan_value = np.iinfo('int64').max else: - values = np.asarray(in_arr) + nan_value = np.iinfo('int64').min + + # unlike floats, which have np.inf, -np.inf, and np.nan + # ints do not + np.putmask(values, values == iNaT, nan_value) n, k = ( values).shape ranks = np.empty((n, k), dtype='f8') @@ -423,6 +453,9 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average', sum_ranks += j + 1 dups += 1 val = values[i, j] + if val == nan_value and keep_na: + ranks[i, argsorted[i, j]] = nan + continue count += 1.0 if j == k - 1 or fabs(values[i, j + 1] - val) > FP_ERR: if tiebreak == TIEBREAK_AVERAGE: diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index f68faf99d3143..e1ba981e93d2e 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -787,7 +787,12 @@ def test_rank2(self): # check the rank expected = DataFrame([[2., nan, 1.], [2., 3., 1.]]) - result = df.rank(1, numeric_only=False) + result = df.rank(1, numeric_only=False, ascending=True) + assert_frame_equal(result, expected) + + expected = DataFrame([[1., nan, 2.], + [2., 1., 3.]]) + result = df.rank(1, numeric_only=False, ascending=False) assert_frame_equal(result, expected) # mixed-type frames