Skip to content

Commit 0c19941

Browse files
committed
Merge pull request pandas-dev#10597 from cpcloud/cat-perf
Improve categorical concat speed by ~20x
2 parents c740bb0 + 6213fb3 commit 0c19941

File tree

5 files changed

+68
-44
lines changed

5 files changed

+68
-44
lines changed

doc/source/whatsnew/v0.17.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,7 @@ Performance Improvements
276276
- 8x improvement in ``timedelta64`` and ``datetime64`` ops (:issue:`6755`)
277277
- Significantly improved performance of indexing ``MultiIndex`` with slicers (:issue:`10287`)
278278
- Improved performance of ``Series.isin`` for datetimelike/integer Series (:issue:`10287`)
279+
- 20x improvement in ``concat`` of Categoricals when categories are identical (:issue:`10587`)
279280

280281
.. _whatsnew_0170.bug_fixes:
281282

pandas/core/categorical.py

+30-25
Original file line numberDiff line numberDiff line change
@@ -1715,18 +1715,20 @@ def _convert_to_list_like(list_like):
17151715
return [list_like]
17161716

17171717
def _concat_compat(to_concat, axis=0):
1718-
"""
1719-
provide concatenation of an object/categorical array of arrays each of which is a single dtype
1718+
"""Concatenate an object/categorical array of arrays, each of which is a
1719+
single dtype
17201720
17211721
Parameters
17221722
----------
17231723
to_concat : array of arrays
1724-
axis : axis to provide concatenation
1725-
in the current impl this is always 0, e.g. we only have 1-d categoricals
1724+
axis : int
1725+
Axis along which to concatenate. In the current implementation this is
1726+
always 0, i.e. we only have 1D categoricals
17261727
17271728
Returns
17281729
-------
1729-
a single array, preserving the combined dtypes
1730+
Categorical
1731+
A single array, preserving the combined dtypes
17301732
"""
17311733

17321734
def convert_categorical(x):
@@ -1735,31 +1737,34 @@ def convert_categorical(x):
17351737
return x.get_values()
17361738
return x.ravel()
17371739

1738-
typs = get_dtype_kinds(to_concat)
1739-
if not len(typs-set(['object','category'])):
1740-
1741-
# we only can deal with object & category types
1742-
pass
1743-
1744-
else:
1745-
1740+
if get_dtype_kinds(to_concat) - set(['object', 'category']):
17461741
# convert to object type and perform a regular concat
17471742
from pandas.core.common import _concat_compat
1748-
return _concat_compat([ np.array(x,copy=False).astype('object') for x in to_concat ],axis=0)
1743+
return _concat_compat([np.array(x, copy=False, dtype=object)
1744+
for x in to_concat], axis=0)
17491745

1750-
# we could have object blocks and categorical's here
1751-
# if we only have a single cateogoricals then combine everything
1746+
# we could have object blocks and categoricals here
1747+
# if all the categoricals share a single set of categories then combine everything
17521748
# else it's a non-compat categorical
1753-
categoricals = [ x for x in to_concat if is_categorical_dtype(x.dtype) ]
1754-
objects = [ x for x in to_concat if is_object_dtype(x.dtype) ]
1749+
categoricals = [x for x in to_concat if is_categorical_dtype(x.dtype)]
17551750

17561751
# validate the categories
1757-
categories = None
1758-
for x in categoricals:
1759-
if categories is None:
1760-
categories = x.categories
1761-
if not categories.equals(x.categories):
1752+
categories = categoricals[0]
1753+
rawcats = categories.categories
1754+
for x in categoricals[1:]:
1755+
if not categories.is_dtype_equal(x):
17621756
raise ValueError("incompatible categories in categorical concat")
17631757

1764-
# concat them
1765-
return Categorical(np.concatenate([ convert_categorical(x) for x in to_concat ],axis=0), categories=categories)
1758+
# we've already checked that all categoricals are the same, so if their
1759+
# count equals the number of inputs then every input has the same categories
1760+
if len(categoricals) == len(to_concat):
1761+
# concating numeric types is much faster than concating object types
1762+
# and fastpath takes a shorter path through the constructor
1763+
return Categorical(np.concatenate([x.codes for x in to_concat], axis=0),
1764+
rawcats,
1765+
ordered=categoricals[0].ordered,
1766+
fastpath=True)
1767+
else:
1768+
concatted = np.concatenate(list(map(convert_categorical, to_concat)),
1769+
axis=0)
1770+
return Categorical(concatted, rawcats)

pandas/core/internals.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -4388,7 +4388,11 @@ def is_null(self):
43884388
# Usually it's enough to check but a small fraction of values to see if
43894389
# a block is NOT null, chunks should help in such cases. 1000 value
43904390
# was chosen rather arbitrarily.
4391-
values_flat = self.block.values.ravel()
4391+
values = self.block.values
4392+
if self.block.is_categorical:
4393+
values_flat = values.categories
4394+
else:
4395+
values_flat = values.ravel()
43924396
total_len = values_flat.shape[0]
43934397
chunk_len = max(total_len // 40, 1000)
43944398
for i in range(0, total_len, chunk_len):

pandas/lib.pyx

+16-18
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem,
2121
PyTuple_SetItem,
2222
PyTuple_New,
2323
PyObject_SetAttrString,
24+
PyObject_RichCompareBool,
2425
PyBytes_GET_SIZE,
2526
PyUnicode_GET_SIZE)
2627

@@ -372,19 +373,19 @@ def isnullobj2d_old(ndarray[object, ndim=2] arr):
372373
result[i, j] = 1
373374
return result.view(np.bool_)
374375

375-
def list_to_object_array(list obj):
376+
377+
@cython.wraparound(False)
378+
@cython.boundscheck(False)
379+
cpdef ndarray[object] list_to_object_array(list obj):
376380
'''
377381
Convert list to object ndarray. Seriously can't believe I had to write this
378382
function
379383
'''
380384
cdef:
381-
Py_ssize_t i, n
382-
ndarray[object] arr
383-
384-
n = len(obj)
385-
arr = np.empty(n, dtype=object)
385+
Py_ssize_t i, n = len(obj)
386+
ndarray[object] arr = np.empty(n, dtype=object)
386387

387-
for i from 0 <= i < n:
388+
for i in range(n):
388389
arr[i] = obj[i]
389390

390391
return arr
@@ -732,28 +733,25 @@ def scalar_compare(ndarray[object] values, object val, object op):
732733

733734
return result.view(bool)
734735

736+
735737
@cython.wraparound(False)
736738
@cython.boundscheck(False)
737-
def array_equivalent_object(ndarray[object] left, ndarray[object] right):
739+
cpdef bint array_equivalent_object(object[:] left, object[:] right):
738740
""" perform an element by element comparison on 1-d object arrays
739741
taking into account nan positions """
740-
cdef Py_ssize_t i, n
741-
cdef object x, y
742+
cdef:
743+
Py_ssize_t i, n = left.shape[0]
744+
object x, y
742745

743-
n = len(left)
744-
for i from 0 <= i < n:
746+
for i in range(n):
745747
x = left[i]
746748
y = right[i]
747749

748750
# we are either not equal or both nan
749751
# I think None == None will be true here
750-
if cpython.PyObject_RichCompareBool(x, y, cpython.Py_EQ):
751-
continue
752-
elif _checknull(x) and _checknull(y):
753-
continue
754-
else:
752+
if not (PyObject_RichCompareBool(x, y, cpython.Py_EQ) or
753+
_checknull(x) and _checknull(y)):
755754
return False
756-
757755
return True
758756

759757

vb_suite/categoricals.py

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
from vbench.benchmark import Benchmark
2+
from datetime import datetime
3+
4+
common_setup = """from pandas_vb_common import *
5+
"""
6+
7+
#----------------------------------------------------------------------
8+
# Concatenation of categorical Series
9+
10+
setup = common_setup + """
11+
s = pd.Series(list('aabbcd') * 1000000).astype('category')
12+
"""
13+
14+
concat_categorical = \
15+
Benchmark("concat([s, s])", setup=setup, name='concat_categorical',
16+
start_date=datetime(year=2015, month=7, day=15))

0 commit comments

Comments
 (0)