Skip to content

Commit 81bb972

Browse files
committed
PERF: add support for NaT in hashtable factorizers, improving Categorical construction with NaT
closes #12077 ``` before after ratio [1330b9f] [404911c6] 17.25ms 1.21ms 0.07 categoricals.categorical_constructor_with_datetimes.time_datetimes_with_nat ``` Author: Jeff Reback <[email protected]> Closes #12128 from jreback/cat_perf and squashes the following commits: ca6e471 [Jeff Reback] rank functions now handle int64 and missing values e1385d8 [Jeff Reback] PERF: add support for NaT in hashtable factorizers, improving Categorical construction with NaT, #12077
1 parent 1330b9f commit 81bb972

File tree

9 files changed

+136
-37
lines changed

9 files changed

+136
-37
lines changed

asv_bench/benchmarks/categoricals.py

+16
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,22 @@ def time_fastpath(self):
4646
Categorical(self.codes, self.cat_idx, fastpath=True)
4747

4848

49+
class categorical_constructor_with_datetimes(object):
50+
goal_time = 0.2
51+
52+
def setup(self):
53+
self.datetimes = pd.Series(pd.date_range(
54+
'1995-01-01 00:00:00', periods=10000, freq='s'))
55+
56+
def time_datetimes(self):
57+
Categorical(self.datetimes)
58+
59+
def time_datetimes_with_nat(self):
60+
t = self.datetimes
61+
t.iloc[-1] = pd.NaT
62+
Categorical(t)
63+
64+
4965
class categorical_rendering(object):
5066
goal_time = 3e-3
5167

doc/source/whatsnew/v0.18.0.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -458,7 +458,7 @@ Performance Improvements
458458
- Improved huge ``DatetimeIndex``, ``PeriodIndex`` and ``TimedeltaIndex``'s ops performance including ``NaT`` (:issue:`10277`)
459459
- Improved performance of ``pandas.concat`` (:issue:`11958`)
460460
- Improved performance of ``StataReader`` (:issue:`11591`)
461-
461+
- Improved performance in construction of ``Categoricals`` with Series of datetimes containing ``NaT`` (:issue:`12077`)
462462

463463

464464

@@ -481,6 +481,7 @@ Bug Fixes
481481
- Bug in vectorized ``DateOffset`` when ``n`` parameter is ``0`` (:issue:`11370`)
482482
- Compat for numpy 1.11 w.r.t. ``NaT`` comparison changes (:issue:`12049`)
483483
- Bug in ``read_csv`` when reading from a ``StringIO`` in threads (:issue:`11790`)
484+
- Bug where ``NaT`` was not treated as a missing value in datetimelikes when factorizing and when constructing ``Categoricals`` (:issue:`12077`)
484485

485486

486487

pandas/algos.pyx

+37-4
Original file line numberDiff line numberDiff line change
@@ -226,14 +226,27 @@ def rank_1d_int64(object in_arr, ties_method='average', ascending=True,
226226
ndarray[int64_t] sorted_data, values
227227
ndarray[float64_t] ranks
228228
ndarray[int64_t] argsorted
229-
int64_t val
229+
int64_t val, nan_value
230230
float64_t sum_ranks = 0
231+
bint keep_na
231232
int tiebreak = 0
232233
float count = 0.0
233234
tiebreak = tiebreakers[ties_method]
234235

236+
keep_na = na_option == 'keep'
237+
235238
values = np.asarray(in_arr)
236239

240+
if ascending ^ (na_option == 'top'):
241+
nan_value = np.iinfo('int64').max
242+
else:
243+
nan_value = np.iinfo('int64').min
244+
245+
# unlike floats, which have np.inf, -np.inf, and np.nan
246+
# ints do not
247+
mask = values == iNaT
248+
np.putmask(values, mask, nan_value)
249+
237250
n = len(values)
238251
ranks = np.empty(n, dtype='f8')
239252

@@ -256,6 +269,9 @@ def rank_1d_int64(object in_arr, ties_method='average', ascending=True,
256269
sum_ranks += i + 1
257270
dups += 1
258271
val = sorted_data[i]
272+
if (val == nan_value) and keep_na:
273+
ranks[argsorted[i]] = nan
274+
continue
259275
count += 1.0
260276
if i == n - 1 or fabs(sorted_data[i + 1] - val) > 0:
261277
if tiebreak == TIEBREAK_AVERAGE:
@@ -387,16 +403,30 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average',
387403
ndarray[float64_t, ndim=2] ranks
388404
ndarray[int64_t, ndim=2] argsorted
389405
ndarray[int64_t, ndim=2, cast=True] values
390-
int64_t val
406+
int64_t val, nan_value
391407
float64_t sum_ranks = 0
408+
bint keep_na = 0
392409
int tiebreak = 0
393410
float count = 0.0
394411
tiebreak = tiebreakers[ties_method]
395412

413+
keep_na = na_option == 'keep'
414+
415+
in_arr = np.asarray(in_arr)
416+
396417
if axis == 0:
397-
values = np.asarray(in_arr).T
418+
values = in_arr.T.copy()
419+
else:
420+
values = in_arr.copy()
421+
422+
if ascending ^ (na_option == 'top'):
423+
nan_value = np.iinfo('int64').max
398424
else:
399-
values = np.asarray(in_arr)
425+
nan_value = np.iinfo('int64').min
426+
427+
# unlike floats, which have np.inf, -np.inf, and np.nan
428+
# ints do not
429+
np.putmask(values, values == iNaT, nan_value)
400430

401431
n, k = (<object> values).shape
402432
ranks = np.empty((n, k), dtype='f8')
@@ -423,6 +453,9 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average',
423453
sum_ranks += j + 1
424454
dups += 1
425455
val = values[i, j]
456+
if val == nan_value and keep_na:
457+
ranks[i, argsorted[i, j]] = nan
458+
continue
426459
count += 1.0
427460
if j == k - 1 or fabs(values[i, j + 1] - val) > FP_ERR:
428461
if tiebreak == TIEBREAK_AVERAGE:

pandas/core/algorithms.py

+18-16
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import pandas.algos as algos
1212
import pandas.hashtable as htable
1313
from pandas.compat import string_types
14+
from pandas.tslib import iNaT
1415

1516

1617
def match(to_match, values, na_sentinel=-1):
@@ -182,17 +183,23 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
182183
"https://github.com/pydata/pandas/issues/6926"
183184
warn(msg, FutureWarning, stacklevel=2)
184185

185-
from pandas.core.index import Index
186-
from pandas.core.series import Series
186+
from pandas import Index, Series, DatetimeIndex
187+
187188
vals = np.asarray(values)
188189

190+
# localize to UTC
191+
is_datetimetz = com.is_datetimetz(values)
192+
if is_datetimetz:
193+
values = DatetimeIndex(values)
194+
vals = values.tz_localize(None)
195+
189196
is_datetime = com.is_datetime64_dtype(vals)
190197
is_timedelta = com.is_timedelta64_dtype(vals)
191198
(hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables)
192199

193200
table = hash_klass(size_hint or len(vals))
194201
uniques = vec_klass()
195-
labels = table.get_labels(vals, uniques, 0, na_sentinel)
202+
labels = table.get_labels(vals, uniques, 0, na_sentinel, True)
196203

197204
labels = com._ensure_platform_int(labels)
198205

@@ -224,7 +231,12 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
224231

225232
uniques = uniques.take(sorter)
226233

227-
if is_datetime:
234+
if is_datetimetz:
235+
236+
# reset tz
237+
uniques = DatetimeIndex(uniques.astype('M8[ns]')).tz_localize(
238+
values.tz)
239+
elif is_datetime:
228240
uniques = uniques.astype('M8[ns]')
229241
elif is_timedelta:
230242
uniques = uniques.astype('m8[ns]')
@@ -296,7 +308,6 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
296308
keys, counts = htable.value_count_scalar64(values, dropna)
297309

298310
if dropna:
299-
from pandas.tslib import iNaT
300311
msk = keys != iNaT
301312
keys, counts = keys[msk], counts[msk]
302313

@@ -478,22 +489,13 @@ def _interpolate(a, b, fraction):
478489

479490

480491
def _get_data_algo(values, func_map):
481-
mask = None
482492
if com.is_float_dtype(values):
483493
f = func_map['float64']
484494
values = com._ensure_float64(values)
485495

486496
elif com.needs_i8_conversion(values):
487-
488-
# if we have NaT, punt to object dtype
489-
mask = com.isnull(values)
490-
if mask.ravel().any():
491-
f = func_map['generic']
492-
values = com._ensure_object(values)
493-
values[mask] = np.nan
494-
else:
495-
f = func_map['int64']
496-
values = values.view('i8')
497+
f = func_map['int64']
498+
values = values.view('i8')
497499

498500
elif com.is_integer_dtype(values):
499501
f = func_map['int64']

pandas/core/categorical.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -257,7 +257,7 @@ def __init__(self, values, categories=None, ordered=False, name=None,
257257
categories = values.categories
258258
values = values.__array__()
259259

260-
elif isinstance(values, ABCIndexClass):
260+
elif isinstance(values, (ABCIndexClass, ABCSeries)):
261261
pass
262262

263263
else:
@@ -1177,7 +1177,7 @@ def get_values(self):
11771177
"""
11781178
# if we are a datetime and period index, return Index to keep metadata
11791179
if com.is_datetimelike(self.categories):
1180-
return self.categories.take(self._codes)
1180+
return self.categories.take(self._codes, fill_value=np.nan)
11811181
return np.array(self)
11821182

11831183
def check_for_ordered(self, op):

pandas/hashtable.pyx

+22-12
Original file line numberDiff line numberDiff line change
@@ -377,12 +377,13 @@ cdef class Int64HashTable(HashTable):
377377

378378
def factorize(self, ndarray[object] values):
379379
reverse = {}
380-
labels = self.get_labels(values, reverse, 0)
380+
labels = self.get_labels(values, reverse, 0, 0)
381381
return reverse, labels
382382

383383
@cython.boundscheck(False)
384384
def get_labels(self, int64_t[:] values, Int64Vector uniques,
385-
Py_ssize_t count_prior, Py_ssize_t na_sentinel):
385+
Py_ssize_t count_prior, Py_ssize_t na_sentinel,
386+
bint check_null=True):
386387
cdef:
387388
Py_ssize_t i, n = len(values)
388389
int64_t[:] labels
@@ -399,6 +400,11 @@ cdef class Int64HashTable(HashTable):
399400
for i in range(n):
400401
val = values[i]
401402
k = kh_get_int64(self.table, val)
403+
404+
if check_null and val == iNaT:
405+
labels[i] = na_sentinel
406+
continue
407+
402408
if k != self.table.n_buckets:
403409
idx = self.table.vals[k]
404410
labels[i] = idx
@@ -525,13 +531,14 @@ cdef class Float64HashTable(HashTable):
525531

526532
def factorize(self, float64_t[:] values):
527533
uniques = Float64Vector()
528-
labels = self.get_labels(values, uniques, 0, -1)
534+
labels = self.get_labels(values, uniques, 0, -1, 1)
529535
return uniques.to_array(), labels
530536

531537
@cython.boundscheck(False)
532538
def get_labels(self, float64_t[:] values,
533-
Float64Vector uniques,
534-
Py_ssize_t count_prior, int64_t na_sentinel):
539+
Float64Vector uniques,
540+
Py_ssize_t count_prior, int64_t na_sentinel,
541+
bint check_null=True):
535542
cdef:
536543
Py_ssize_t i, n = len(values)
537544
int64_t[:] labels
@@ -548,7 +555,7 @@ cdef class Float64HashTable(HashTable):
548555
for i in range(n):
549556
val = values[i]
550557

551-
if val != val:
558+
if check_null and val != val:
552559
labels[i] = na_sentinel
553560
continue
554561

@@ -762,7 +769,8 @@ cdef class PyObjectHashTable(HashTable):
762769
return uniques.to_array()
763770

764771
def get_labels(self, ndarray[object] values, ObjectVector uniques,
765-
Py_ssize_t count_prior, int64_t na_sentinel):
772+
Py_ssize_t count_prior, int64_t na_sentinel,
773+
bint check_null=True):
766774
cdef:
767775
Py_ssize_t i, n = len(values)
768776
int64_t[:] labels
@@ -777,7 +785,7 @@ cdef class PyObjectHashTable(HashTable):
777785
val = values[i]
778786
hash(val)
779787

780-
if val != val or val is None:
788+
if check_null and val != val or val is None:
781789
labels[i] = na_sentinel
782790
continue
783791

@@ -808,14 +816,15 @@ cdef class Factorizer:
808816
def get_count(self):
809817
return self.count
810818

811-
def factorize(self, ndarray[object] values, sort=False, na_sentinel=-1):
819+
def factorize(self, ndarray[object] values, sort=False, na_sentinel=-1,
820+
check_null=True):
812821
"""
813822
Factorize values with nans replaced by na_sentinel
814823
>>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20)
815824
array([ 0, 1, 20])
816825
"""
817826
labels = self.table.get_labels(values, self.uniques,
818-
self.count, na_sentinel)
827+
self.count, na_sentinel, check_null)
819828
mask = (labels == na_sentinel)
820829
# sort on
821830
if sort:
@@ -848,9 +857,10 @@ cdef class Int64Factorizer:
848857
return self.count
849858

850859
def factorize(self, int64_t[:] values, sort=False,
851-
na_sentinel=-1):
860+
na_sentinel=-1, check_null=True):
852861
labels = self.table.get_labels(values, self.uniques,
853-
self.count, na_sentinel)
862+
self.count, na_sentinel,
863+
check_null)
854864

855865
# sort on
856866
if sort:

pandas/tests/frame/test_analytics.py

+6-1
Original file line numberDiff line numberDiff line change
@@ -787,7 +787,12 @@ def test_rank2(self):
787787
# check the rank
788788
expected = DataFrame([[2., nan, 1.],
789789
[2., 3., 1.]])
790-
result = df.rank(1, numeric_only=False)
790+
result = df.rank(1, numeric_only=False, ascending=True)
791+
assert_frame_equal(result, expected)
792+
793+
expected = DataFrame([[1., nan, 2.],
794+
[2., 1., 3.]])
795+
result = df.rank(1, numeric_only=False, ascending=False)
791796
assert_frame_equal(result, expected)
792797

793798
# mixed-type frames

pandas/tests/test_categorical.py

+32
Original file line numberDiff line numberDiff line change
@@ -308,6 +308,38 @@ def test_constructor_with_generator(self):
308308
cat = pd.Categorical([0, 1, 2], categories=xrange(3))
309309
self.assertTrue(cat.equals(exp))
310310

311+
def test_constructor_with_datetimelike(self):
312+
313+
# 12077
314+
# constructor with a datetimelike and NaT
315+
316+
for dtl in [pd.date_range('1995-01-01 00:00:00',
317+
periods=5, freq='s'),
318+
pd.date_range('1995-01-01 00:00:00',
319+
periods=5, freq='s', tz='US/Eastern'),
320+
pd.timedelta_range('1 day', periods=5, freq='s')]:
321+
322+
s = Series(dtl)
323+
c = Categorical(s)
324+
expected = type(dtl)(s)
325+
expected.freq = None
326+
tm.assert_index_equal(c.categories, expected)
327+
self.assert_numpy_array_equal(c.codes, np.arange(5, dtype='int8'))
328+
329+
# with NaT
330+
s2 = s.copy()
331+
s2.iloc[-1] = pd.NaT
332+
c = Categorical(s2)
333+
expected = type(dtl)(s2.dropna())
334+
expected.freq = None
335+
tm.assert_index_equal(c.categories, expected)
336+
self.assert_numpy_array_equal(c.codes,
337+
np.concatenate([np.arange(4, dtype='int8'),
338+
[-1]]))
339+
340+
result = repr(c)
341+
self.assertTrue('NaT' in result)
342+
311343
def test_from_codes(self):
312344

313345
# too few categories

pandas/tseries/period.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -832,7 +832,7 @@ def _format_native_types(self, na_rep=u('NaT'), date_format=None,
832832
values[imask] = np.array([formatter(dt) for dt in values[imask]])
833833
return values
834834

835-
def take(self, indices, axis=0):
835+
def take(self, indices, axis=0, allow_fill=True, fill_value=None):
836836
"""
837837
Analogous to ndarray.take
838838
"""

0 commit comments

Comments
 (0)