PERF: add support for NaT in hashtable factorizers, improving Categorical construction with NaT #12128
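A minimal sketch of what changes for users, based on the tests added in this PR (assumes a pandas build with this patch applied):

```python
import pandas as pd

# A datetime Series with a trailing NaT -- the case this PR optimizes.
s = pd.Series(pd.date_range('1995-01-01 00:00:00', periods=5, freq='s'))
s.iloc[-1] = pd.NaT

c = pd.Categorical(s)
# NaT is treated as missing: it gets the sentinel code -1 and is
# excluded from the categories (see the tests added below).
print(c.codes)       # [0, 1, 2, 3, -1]
print(c.categories)  # DatetimeIndex of the four non-NaT timestamps
```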


Status: Closed · 2 commits
16 changes: 16 additions & 0 deletions asv_bench/benchmarks/categoricals.py
@@ -46,6 +46,22 @@ def time_fastpath(self):
Categorical(self.codes, self.cat_idx, fastpath=True)


class categorical_constructor_with_datetimes(object):
goal_time = 0.2

def setup(self):
self.datetimes = pd.Series(pd.date_range(
'1995-01-01 00:00:00', periods=10000, freq='s'))

def time_datetimes(self):
Categorical(self.datetimes)

def time_datetimes_with_nat(self):
t = self.datetimes
t.iloc[-1] = pd.NaT
Categorical(t)


class categorical_rendering(object):
goal_time = 3e-3

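For a rough standalone sense of what the new benchmark measures, outside the asv harness (a sketch using timeit; numbers will vary by machine):

```python
import timeit
import pandas as pd

s = pd.Series(pd.date_range('1995-01-01 00:00:00', periods=10000, freq='s'))
s_nat = s.copy()
s_nat.iloc[-1] = pd.NaT

# Time Categorical construction with and without NaT, mirroring
# time_datetimes and time_datetimes_with_nat above.
print(timeit.timeit(lambda: pd.Categorical(s), number=100))
print(timeit.timeit(lambda: pd.Categorical(s_nat), number=100))
```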
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v0.18.0.txt
@@ -458,7 +458,7 @@ Performance Improvements
- Improved huge ``DatetimeIndex``, ``PeriodIndex`` and ``TimedeltaIndex``'s ops performance including ``NaT`` (:issue:`10277`)
- Improved performance of ``pandas.concat`` (:issue:`11958`)
- Improved performance of ``StataReader`` (:issue:`11591`)

- Improved performance in construction of ``Categoricals`` with Series of datetimes containing ``NaT`` (:issue:`12077`)



@@ -481,6 +481,7 @@ Bug Fixes
- Bug in vectorized ``DateOffset`` when ``n`` parameter is ``0`` (:issue:`11370`)
- Compat for numpy 1.11 w.r.t. ``NaT`` comparison changes (:issue:`12049`)
- Bug in ``read_csv`` when reading from a ``StringIO`` in threads (:issue:`11790`)
- Bug in not treating ``NaT`` as a missing value in datetimelikes when factorizing & with ``Categoricals`` (:issue:`12077`)



41 changes: 37 additions & 4 deletions pandas/algos.pyx
@@ -226,14 +226,27 @@ def rank_1d_int64(object in_arr, ties_method='average', ascending=True,
ndarray[int64_t] sorted_data, values
ndarray[float64_t] ranks
ndarray[int64_t] argsorted
int64_t val
int64_t val, nan_value
float64_t sum_ranks = 0
bint keep_na
int tiebreak = 0
float count = 0.0
tiebreak = tiebreakers[ties_method]

keep_na = na_option == 'keep'

values = np.asarray(in_arr)

if ascending ^ (na_option == 'top'):
nan_value = np.iinfo('int64').max
else:
nan_value = np.iinfo('int64').min

# unlike floats, which have np.inf, -np.inf, and np.nan
# ints do not
mask = values == iNaT
np.putmask(values, mask, nan_value)

n = len(values)
ranks = np.empty(n, dtype='f8')

@@ -256,6 +269,9 @@
sum_ranks += i + 1
dups += 1
val = sorted_data[i]
if (val == nan_value) and keep_na:
ranks[argsorted[i]] = nan
continue
count += 1.0
if i == n - 1 or fabs(sorted_data[i + 1] - val) > 0:
if tiebreak == TIEBREAK_AVERAGE:
@@ -387,16 +403,30 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average',
ndarray[float64_t, ndim=2] ranks
ndarray[int64_t, ndim=2] argsorted
ndarray[int64_t, ndim=2, cast=True] values
int64_t val
int64_t val, nan_value
float64_t sum_ranks = 0
bint keep_na = 0
int tiebreak = 0
float count = 0.0
tiebreak = tiebreakers[ties_method]

keep_na = na_option == 'keep'

in_arr = np.asarray(in_arr)

if axis == 0:
values = np.asarray(in_arr).T
values = in_arr.T.copy()
else:
values = in_arr.copy()

if ascending ^ (na_option == 'top'):
nan_value = np.iinfo('int64').max
else:
values = np.asarray(in_arr)
nan_value = np.iinfo('int64').min

# unlike floats, which have np.inf, -np.inf, and np.nan
# ints do not
np.putmask(values, values == iNaT, nan_value)

n, k = (<object> values).shape
ranks = np.empty((n, k), dtype='f8')
@@ -423,6 +453,9 @@
sum_ranks += j + 1
dups += 1
val = values[i, j]
if val == nan_value and keep_na:
ranks[i, argsorted[i, j]] = nan
continue
count += 1.0
if j == k - 1 or fabs(values[i, j + 1] - val) > FP_ERR:
if tiebreak == TIEBREAK_AVERAGE:
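At the user level, the rank changes look roughly like this (datetimes rank through their int64 view, so NaT needs the sentinel mapping above; assumes the default na_option='keep'):

```python
import pandas as pd

s = pd.Series(pd.to_datetime(['2000-01-02', 'NaT', '2000-01-01']))

# NaT maps to iNaT in the int64 view; with na_option='keep' it is
# assigned rank NaN instead of competing as the minimum int64.
print(s.rank())                 # [2.0, NaN, 1.0]
print(s.rank(ascending=False))  # [1.0, NaN, 2.0]
```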
34 changes: 18 additions & 16 deletions pandas/core/algorithms.py
@@ -11,6 +11,7 @@
import pandas.algos as algos
import pandas.hashtable as htable
from pandas.compat import string_types
from pandas.tslib import iNaT


def match(to_match, values, na_sentinel=-1):
@@ -182,17 +183,23 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
"https://github.com/pydata/pandas/issues/6926"
warn(msg, FutureWarning, stacklevel=2)

from pandas.core.index import Index
from pandas.core.series import Series
from pandas import Index, Series, DatetimeIndex

vals = np.asarray(values)

# localize to UTC
is_datetimetz = com.is_datetimetz(values)
if is_datetimetz:
values = DatetimeIndex(values)
vals = values.tz_localize(None)

is_datetime = com.is_datetime64_dtype(vals)
is_timedelta = com.is_timedelta64_dtype(vals)
(hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables)

table = hash_klass(size_hint or len(vals))
uniques = vec_klass()
labels = table.get_labels(vals, uniques, 0, na_sentinel)
labels = table.get_labels(vals, uniques, 0, na_sentinel, True)

labels = com._ensure_platform_int(labels)

@@ -224,7 +231,12 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):

uniques = uniques.take(sorter)

if is_datetime:
if is_datetimetz:

# reset tz
uniques = DatetimeIndex(uniques.astype('M8[ns]')).tz_localize(
values.tz)
elif is_datetime:
uniques = uniques.astype('M8[ns]')
elif is_timedelta:
uniques = uniques.astype('m8[ns]')
@@ -296,7 +308,6 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
keys, counts = htable.value_count_scalar64(values, dropna)

if dropna:
from pandas.tslib import iNaT
msk = keys != iNaT
keys, counts = keys[msk], counts[msk]

@@ -478,22 +489,13 @@ def _interpolate(a, b, fraction):


def _get_data_algo(values, func_map):
mask = None
if com.is_float_dtype(values):
f = func_map['float64']
values = com._ensure_float64(values)

elif com.needs_i8_conversion(values):

# if we have NaT, punt to object dtype
mask = com.isnull(values)
if mask.ravel().any():
f = func_map['generic']
values = com._ensure_object(values)
values[mask] = np.nan
else:
f = func_map['int64']
values = values.view('i8')
f = func_map['int64']
values = values.view('i8')

elif com.is_integer_dtype(values):
f = func_map['int64']
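A sketch of how factorize() now behaves on tz-aware input containing NaT, per the datetimetz branch above (labels use the na_sentinel for NaT; uniques come back tz-aware after the round-trip through naive values):

```python
import pandas as pd

s = pd.Series(pd.date_range('1995-01-01', periods=3, freq='D', tz='US/Eastern'))
s.iloc[-1] = pd.NaT

labels, uniques = pd.factorize(s)
print(labels)   # [0, 1, -1] -- NaT maps to na_sentinel, not a category
print(uniques)  # tz-aware DatetimeIndex with the two non-NaT values
```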
4 changes: 2 additions & 2 deletions pandas/core/categorical.py
@@ -257,7 +257,7 @@ def __init__(self, values, categories=None, ordered=False, name=None,
categories = values.categories
values = values.__array__()

elif isinstance(values, ABCIndexClass):
elif isinstance(values, (ABCIndexClass, ABCSeries)):
pass

else:
@@ -1177,7 +1177,7 @@ def get_values(self):
"""
# if we are a datetime and period index, return Index to keep metadata
if com.is_datetimelike(self.categories):
return self.categories.take(self._codes)
return self.categories.take(self._codes, fill_value=np.nan)
return np.array(self)

def check_for_ordered(self, op):
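The get_values() change in effect, sketched: passing fill_value=np.nan makes -1 codes surface as NaT rather than indexing from the end of the categories.

```python
import pandas as pd

c = pd.Categorical(pd.Series(pd.to_datetime(['2000-01-01', 'NaT'])))
print(c.codes)         # [0, -1]
# take(..., fill_value=np.nan) maps the -1 code to NaT; without it,
# -1 would index from the end and wrongly return the last category.
print(c.get_values())  # DatetimeIndex(['2000-01-01', 'NaT'], ...)
```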
34 changes: 22 additions & 12 deletions pandas/hashtable.pyx
@@ -377,12 +377,13 @@ cdef class Int64HashTable(HashTable):

def factorize(self, ndarray[object] values):
reverse = {}
labels = self.get_labels(values, reverse, 0)
labels = self.get_labels(values, reverse, 0, 0)
return reverse, labels

@cython.boundscheck(False)
def get_labels(self, int64_t[:] values, Int64Vector uniques,
Py_ssize_t count_prior, Py_ssize_t na_sentinel):
Py_ssize_t count_prior, Py_ssize_t na_sentinel,
bint check_null=True):
Member: In what situation would check_null=False?

Contributor (author): In theory I should not be checking this for a pure int array (as opposed to a view of a datetimelike), or for an integer array that is not int64; we don't really support either condition now.

Member: I see. Well, this is fine then.

cdef:
Py_ssize_t i, n = len(values)
int64_t[:] labels
@@ -399,6 +400,11 @@
for i in range(n):
val = values[i]
k = kh_get_int64(self.table, val)

if check_null and val == iNaT:
labels[i] = na_sentinel
continue

if k != self.table.n_buckets:
idx = self.table.vals[k]
labels[i] = idx
@@ -525,13 +531,14 @@ cdef class Float64HashTable(HashTable):

def factorize(self, float64_t[:] values):
uniques = Float64Vector()
labels = self.get_labels(values, uniques, 0, -1)
labels = self.get_labels(values, uniques, 0, -1, 1)
return uniques.to_array(), labels

@cython.boundscheck(False)
def get_labels(self, float64_t[:] values,
Float64Vector uniques,
Py_ssize_t count_prior, int64_t na_sentinel):
Float64Vector uniques,
Py_ssize_t count_prior, int64_t na_sentinel,
bint check_null=True):
cdef:
Py_ssize_t i, n = len(values)
int64_t[:] labels
@@ -548,7 +555,7 @@
for i in range(n):
val = values[i]

if val != val:
if check_null and val != val:
labels[i] = na_sentinel
continue

@@ -762,7 +769,8 @@ cdef class PyObjectHashTable(HashTable):
return uniques.to_array()

def get_labels(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior, int64_t na_sentinel):
Py_ssize_t count_prior, int64_t na_sentinel,
bint check_null=True):
cdef:
Py_ssize_t i, n = len(values)
int64_t[:] labels
@@ -777,7 +785,7 @@
val = values[i]
hash(val)

if val != val or val is None:
if check_null and val != val or val is None:
labels[i] = na_sentinel
continue

@@ -808,14 +816,15 @@ cdef class Factorizer:
def get_count(self):
return self.count

def factorize(self, ndarray[object] values, sort=False, na_sentinel=-1):
def factorize(self, ndarray[object] values, sort=False, na_sentinel=-1,
check_null=True):
"""
Factorize values with nans replaced by na_sentinel
>>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20)
array([ 0, 1, 20])
"""
labels = self.table.get_labels(values, self.uniques,
self.count, na_sentinel)
self.count, na_sentinel, check_null)
mask = (labels == na_sentinel)
# sort on
if sort:
@@ -848,9 +857,10 @@ cdef class Int64Factorizer:
return self.count

def factorize(self, int64_t[:] values, sort=False,
na_sentinel=-1):
na_sentinel=-1, check_null=True):
labels = self.table.get_labels(values, self.uniques,
self.count, na_sentinel)
self.count, na_sentinel,
check_null)

# sort on
if sort:
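A pure-Python analogue of the check_null path in get_labels (a sketch, not the Cython implementation; iNaT is pandas' int64 NaT sentinel, equal to the minimum int64):

```python
import numpy as np

iNaT = np.iinfo(np.int64).min  # same value as pandas.tslib.iNaT

def get_labels(values, na_sentinel=-1, check_null=True):
    labels, uniques, table = [], [], {}
    for val in values:
        # With check_null=True, the NaT sentinel is treated as missing
        # and never enters the hash table or the uniques.
        if check_null and val == iNaT:
            labels.append(na_sentinel)
            continue
        if val not in table:
            table[val] = len(uniques)
            uniques.append(val)
        labels.append(table[val])
    return np.asarray(labels), np.asarray(uniques)

vals = np.array([10, iNaT, 10], dtype='i8')
print(get_labels(vals))                    # ([0, -1, 0], [10])
print(get_labels(vals, check_null=False))  # ([0, 1, 0], [10, iNaT])
```

With check_null=False, iNaT is hashed like any other int64 and becomes its own category, which is why the review above asks when that flag would be off (answer: only for pure int arrays that are not views of datetimelikes).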
7 changes: 6 additions & 1 deletion pandas/tests/frame/test_analytics.py
@@ -787,7 +787,12 @@ def test_rank2(self):
# check the rank
expected = DataFrame([[2., nan, 1.],
[2., 3., 1.]])
result = df.rank(1, numeric_only=False)
result = df.rank(1, numeric_only=False, ascending=True)
assert_frame_equal(result, expected)

expected = DataFrame([[1., nan, 2.],
[2., 1., 3.]])
result = df.rank(1, numeric_only=False, ascending=False)
assert_frame_equal(result, expected)

# mixed-type frames
32 changes: 32 additions & 0 deletions pandas/tests/test_categorical.py
@@ -308,6 +308,38 @@ def test_constructor_with_generator(self):
cat = pd.Categorical([0, 1, 2], categories=xrange(3))
self.assertTrue(cat.equals(exp))

def test_constructor_with_datetimelike(self):

# 12077
# constructor with a datetimelike and NaT

for dtl in [pd.date_range('1995-01-01 00:00:00',
periods=5, freq='s'),
pd.date_range('1995-01-01 00:00:00',
periods=5, freq='s', tz='US/Eastern'),
pd.timedelta_range('1 day', periods=5, freq='s')]:

s = Series(dtl)
c = Categorical(s)
expected = type(dtl)(s)
expected.freq = None
tm.assert_index_equal(c.categories, expected)
self.assert_numpy_array_equal(c.codes, np.arange(5, dtype='int8'))

# with NaT
s2 = s.copy()
s2.iloc[-1] = pd.NaT
c = Categorical(s2)
expected = type(dtl)(s2.dropna())
expected.freq = None
tm.assert_index_equal(c.categories, expected)
self.assert_numpy_array_equal(c.codes,
np.concatenate([np.arange(4, dtype='int8'),
[-1]]))

result = repr(c)
self.assertTrue('NaT' in result)

def test_from_codes(self):

# too few categories
2 changes: 1 addition & 1 deletion pandas/tseries/period.py
@@ -832,7 +832,7 @@ def _format_native_types(self, na_rep=u('NaT'), date_format=None,
values[imask] = np.array([formatter(dt) for dt in values[imask]])
return values

def take(self, indices, axis=0):
def take(self, indices, axis=0, allow_fill=True, fill_value=None):
"""
Analogous to ndarray.take
"""