diff --git a/doc/source/release.rst b/doc/source/release.rst
index a6aa842940bc0..2151407b8c2fd 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -313,6 +313,8 @@ Improvements to existing features
   in item handling (:issue:`6745`, :issue:`6988`).
 - Improve performance in certain reindexing operations by optimizing ``take_2d`` (:issue:`6749`)
 - Arrays of strings can be wrapped to a specified width (``str.wrap``) (:issue:`6999`)
+- ``GroupBy.count()`` is now implemented in Cython and is much faster for large
+  numbers of groups (:issue:`7016`).
 
 .. _release.bug_fixes-0.14.0:
 
diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt
index b5df39df3b617..f9f9e5d5e4ad3 100644
--- a/doc/source/v0.14.0.txt
+++ b/doc/source/v0.14.0.txt
@@ -568,6 +568,8 @@ Performance
 - Performance improvements in timedelta conversions for integer dtypes (:issue:`6754`)
 - Improved performance of compatible pickles (:issue:`6899`)
 - Improve performance in certain reindexing operations by optimizing ``take_2d`` (:issue:`6749`)
+- ``GroupBy.count()`` is now implemented in Cython and is much faster for large
+  numbers of groups (:issue:`7016`).
 
 Experimental
 ~~~~~~~~~~~~
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index ce64ed754180d..400f7e06df784 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -5,7 +5,7 @@
 import collections
 
 from pandas.compat import(
-    zip, builtins, range, long, lrange, lzip,
+    zip, builtins, range, long, lzip,
     OrderedDict, callable
 )
 from pandas import compat
@@ -713,15 +713,6 @@ def size(self):
         """
         return self.grouper.size()
 
-    def count(self, axis=0):
-        """
-        Number of non-null items in each group.
-        axis : axis number, default 0
-            the grouping axis
-        """
-        self._set_selection_from_grouper()
-        return self._python_agg_general(lambda x: notnull(x).sum(axis=axis)).astype('int64')
-
     sum = _groupby_function('sum', 'add', np.sum)
     prod = _groupby_function('prod', 'prod', np.prod)
     min = _groupby_function('min', 'min', np.min, numeric_only=False)
@@ -731,6 +722,17 @@ def count(self, axis=0):
     last = _groupby_function('last', 'last', _last_compat, numeric_only=False,
                              _convert=True)
 
+    _count = _groupby_function('_count', 'count',
+                               lambda x, axis=0: notnull(x).sum(axis=axis),
+                               numeric_only=False)
+
+    def count(self, axis=0):
+        """
+        Number of non-null items in each group.
+        axis : axis number, default 0
+            the grouping axis
+        """
+        return self._count().astype('int64')
 
     def ohlc(self):
         """
@@ -1318,10 +1320,11 @@ def get_group_levels(self):
             'f': lambda func, a, b, c, d: func(a, b, c, d, 1)
         },
         'last': 'group_last',
+        'count': 'group_count',
     }
 
     _cython_transforms = {
-        'std': np.sqrt
+        'std': np.sqrt,
     }
 
     _cython_arity = {
@@ -1390,14 +1393,18 @@ def aggregate(self, values, how, axis=0):
             values = com.ensure_float(values)
             is_numeric = True
         else:
-            if issubclass(values.dtype.type, np.datetime64):
-                raise Exception('Cython not able to handle this case')
-
-            values = values.astype(object)
-            is_numeric = False
+            # datetimelike (datetime64/timedelta64) values are handled as
+            # numeric by viewing the data as int64; NaT becomes iNaT there
+            is_numeric = issubclass(values.dtype.type, (np.datetime64,
+                                                        np.timedelta64))
+            if is_numeric:
+                values = values.view('int64')
+            else:
+                values = values.astype(object)
 
         # will be filled in Cython function
-        result = np.empty(out_shape, dtype=values.dtype)
+        result = np.empty(out_shape,
+                          dtype=np.dtype('f%d' % values.dtype.itemsize))
         result.fill(np.nan)
 
         counts = np.zeros(self.ngroups, dtype=np.int64)
@@ -1405,10 +1412,10 @@
 
         if self._filter_empty_groups:
             if result.ndim == 2:
-                if is_numeric:
+                try:
                     result = lib.row_bool_subset(
                         result, (counts > 0).view(np.uint8))
-                else:
+                except ValueError:
                     result = lib.row_bool_subset_object(
                         result, (counts > 0).view(np.uint8))
             else:
@@ -1651,6 +1658,7 @@ def names(self):
             'f': lambda func, a, b, c, d: func(a, b, c, d, 1)
         },
         'last': 'group_last_bin',
+        'count': 'group_count_bin',
     }
 
     _name_functions = {
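
A side note on the datetimelike branch in ``aggregate()`` above: it relies on
``datetime64``/``timedelta64`` data reinterpreting cleanly as ``int64``, with
``NaT`` mapping to the minimum ``int64`` value, which is the ``iNaT`` sentinel
the count kernels below test against. A standalone NumPy sketch (illustrative
only, not part of the patch):

    import numpy as np

    dates = np.array(['2014-05-05', 'NaT'], dtype='datetime64[ns]')
    ints = dates.view('int64')  # reinterpret the buffer, no copy
    print(ints[1] == np.iinfo(np.int64).min)  # True: NaT is the iNaT sentinel
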
diff --git a/pandas/src/generate_code.py b/pandas/src/generate_code.py
index 6d45a82fb7175..53754a899adf8 100644
--- a/pandas/src/generate_code.py
+++ b/pandas/src/generate_code.py
@@ -3,7 +3,6 @@
 # don't introduce a pandas/pandas.compat import
 # or we get a bootstrapping problem
 from StringIO import StringIO
-import os
 
 header = """
 cimport numpy as np
@@ -34,7 +33,9 @@ ctypedef unsigned char UChar
 
 cimport util
-from util cimport is_array, _checknull, _checknan
+from util cimport is_array, _checknull, _checknan, get_nat
+
+cdef int64_t iNaT = get_nat()
 
 # import datetime C API
 PyDateTime_IMPORT
 
@@ -1150,6 +1151,79 @@ def group_var_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
                                (ct * ct - ct))
 """
 
+group_count_template = """@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_count_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
+                         ndarray[int64_t] counts,
+                         ndarray[%(c_type)s, ndim=2] values,
+                         ndarray[int64_t] labels):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, lab
+        Py_ssize_t N = values.shape[0], K = values.shape[1]
+        %(c_type)s val
+        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
+                                                 dtype=np.int64)
+
+    if len(values) != len(labels):
+        raise AssertionError("len(values) != len(labels)")
+
+    for i in range(N):
+        lab = labels[i]
+        if lab < 0:
+            continue
+
+        counts[lab] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            nobs[lab, j] += val == val and val != iNaT
+
+    for i in range(len(counts)):
+        for j in range(K):
+            out[i, j] = nobs[i, j]
+
+
+"""
+
+group_count_bin_template = """@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_count_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
+                             ndarray[int64_t] counts,
+                             ndarray[%(c_type)s, ndim=2] values,
+                             ndarray[int64_t] bins):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, ngroups
+        Py_ssize_t N = values.shape[0], K = values.shape[1], b = 0
+        %(c_type)s val
+        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
+                                                 dtype=np.int64)
+
+    ngroups = len(bins) + (bins[len(bins) - 1] != N)
+
+    for i in range(N):
+        while b < ngroups - 1 and i >= bins[b]:
+            b += 1
+
+        counts[b] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            nobs[b, j] += val == val and val != iNaT
+
+    for i in range(ngroups):
+        for j in range(K):
+            out[i, j] = nobs[i, j]
+
+
+"""
 
 # add passing bin edges, instead of labels
 
@@ -2145,7 +2219,8 @@ def put2d_%(name)s_%(dest_type)s(ndarray[%(c_type)s, ndim=2, cast=True] values,
 #-------------------------------------------------------------------------
 # Generators
 
-def generate_put_template(template, use_ints = True, use_floats = True):
+def generate_put_template(template, use_ints=True, use_floats=True,
+                          use_objects=False):
     floats_list = [
         ('float64', 'float64_t', 'float64_t', 'np.float64'),
         ('float32', 'float32_t', 'float32_t', 'np.float32'),
@@ -2156,11 +2231,14 @@ def generate_put_template(template, use_ints = True, use_floats = True):
         ('int32', 'int32_t', 'float64_t', 'np.float64'),
         ('int64', 'int64_t', 'float64_t', 'np.float64'),
     ]
+    object_list = [('object', 'object', 'float64_t', 'np.float64')]
     function_list = []
     if use_floats:
         function_list.extend(floats_list)
     if use_ints:
         function_list.extend(ints_list)
+    if use_objects:
+        function_list.extend(object_list)
 
     output = StringIO()
     for name, c_type, dest_type, dest_dtype in function_list:
@@ -2251,6 +2329,8 @@ def generate_from_template(template, exclude=None):
                 group_max_bin_template,
                 group_ohlc_template]
 
+groupby_count = [group_count_template, group_count_bin_template]
+
 templates_1d = [map_indices_template,
                 pad_template,
                 backfill_template,
@@ -2272,6 +2352,7 @@ def generate_from_template(template, exclude=None):
                      take_2d_axis1_template,
                      take_2d_multi_template]
 
+
 def generate_take_cython_file(path='generated.pyx'):
     with open(path, 'w') as f:
         print(header, file=f)
@@ -2288,7 +2369,10 @@ def generate_take_cython_file(path='generated.pyx'):
             print(generate_put_template(template), file=f)
 
         for template in groupbys:
-            print(generate_put_template(template, use_ints = False), file=f)
+            print(generate_put_template(template, use_ints=False), file=f)
+
+        for template in groupby_count:
+            print(generate_put_template(template, use_objects=True), file=f)
 
         # for template in templates_1d_datetime:
         #     print >> f, generate_from_template_datetime(template)
@@ -2299,5 +2383,6 @@ def generate_take_cython_file(path='generated.pyx'):
         for template in nobool_1d_templates:
             print(generate_from_template(template, exclude=['bool']), file=f)
 
+
 if __name__ == '__main__':
     generate_take_cython_file()
diff --git a/pandas/src/generated.pyx b/pandas/src/generated.pyx
index 68bda2957fb55..26c6f3daf0e0a 100644
--- a/pandas/src/generated.pyx
+++ b/pandas/src/generated.pyx
@@ -27,7 +27,9 @@ from khash cimport *
 ctypedef unsigned char UChar
 
 cimport util
-from util cimport is_array, _checknull, _checknan
+from util cimport is_array, _checknull, _checknan, get_nat
+
+cdef int64_t iNaT = get_nat()
 
 # import datetime C API
 PyDateTime_IMPORT
@@ -6621,6 +6623,498 @@ def group_ohlc_float32(ndarray[float32_t, ndim=2] out,
             out[b, 2] = vlow
             out[b, 3] = vclose
 
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_count_float64(ndarray[float64_t, ndim=2] out,
+                         ndarray[int64_t] counts,
+                         ndarray[float64_t, ndim=2] values,
+                         ndarray[int64_t] labels):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, lab
+        Py_ssize_t N = values.shape[0], K = values.shape[1]
+        float64_t val
+        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
+                                                 dtype=np.int64)
+
+    if len(values) != len(labels):
+        raise AssertionError("len(values) != len(labels)")
+
+    for i in range(N):
+        lab = labels[i]
+        if lab < 0:
+            continue
+
+        counts[lab] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            nobs[lab, j] += val == val and val != iNaT
+
+    for i in range(len(counts)):
+        for j in range(K):
+            out[i, j] = nobs[i, j]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_count_float32(ndarray[float32_t, ndim=2] out,
+                         ndarray[int64_t] counts,
+                         ndarray[float32_t, ndim=2] values,
+                         ndarray[int64_t] labels):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, lab
+        Py_ssize_t N = values.shape[0], K = values.shape[1]
+        float32_t val
+        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
+                                                 dtype=np.int64)
+
+    if len(values) != len(labels):
+        raise AssertionError("len(values) != len(labels)")
+
+    for i in range(N):
+        lab = labels[i]
+        if lab < 0:
+            continue
+
+        counts[lab] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            nobs[lab, j] += val == val and val != iNaT
+
+    for i in range(len(counts)):
+        for j in range(K):
+            out[i, j] = nobs[i, j]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_count_int8(ndarray[float32_t, ndim=2] out,
+                         ndarray[int64_t] counts,
+                         ndarray[int8_t, ndim=2] values,
+                         ndarray[int64_t] labels):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, lab
+        Py_ssize_t N = values.shape[0], K = values.shape[1]
+        int8_t val
+        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
+                                                 dtype=np.int64)
+
+    if len(values) != len(labels):
+        raise AssertionError("len(values) != len(labels)")
+
+    for i in range(N):
+        lab = labels[i]
+        if lab < 0:
+            continue
+
+        counts[lab] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            nobs[lab, j] += val == val and val != iNaT
+
+    for i in range(len(counts)):
+        for j in range(K):
+            out[i, j] = nobs[i, j]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_count_int16(ndarray[float32_t, ndim=2] out,
+                         ndarray[int64_t] counts,
+                         ndarray[int16_t, ndim=2] values,
+                         ndarray[int64_t] labels):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, lab
+        Py_ssize_t N = values.shape[0], K = values.shape[1]
+        int16_t val
+        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
+                                                 dtype=np.int64)
+
+    if len(values) != len(labels):
+        raise AssertionError("len(values) != len(labels)")
+
+    for i in range(N):
+        lab = labels[i]
+        if lab < 0:
+            continue
+
+        counts[lab] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            nobs[lab, j] += val == val and val != iNaT
+
+    for i in range(len(counts)):
+        for j in range(K):
+            out[i, j] = nobs[i, j]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_count_int32(ndarray[float64_t, ndim=2] out,
+                         ndarray[int64_t] counts,
+                         ndarray[int32_t, ndim=2] values,
+                         ndarray[int64_t] labels):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, lab
+        Py_ssize_t N = values.shape[0], K = values.shape[1]
+        int32_t val
+        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
+                                                 dtype=np.int64)
+
+    if len(values) != len(labels):
+        raise AssertionError("len(values) != len(labels)")
+
+    for i in range(N):
+        lab = labels[i]
+        if lab < 0:
+            continue
+
+        counts[lab] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            nobs[lab, j] += val == val and val != iNaT
+
+    for i in range(len(counts)):
+        for j in range(K):
+            out[i, j] = nobs[i, j]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_count_int64(ndarray[float64_t, ndim=2] out,
+                         ndarray[int64_t] counts,
+                         ndarray[int64_t, ndim=2] values,
+                         ndarray[int64_t] labels):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, lab
+        Py_ssize_t N = values.shape[0], K = values.shape[1]
+        int64_t val
+        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
+                                                 dtype=np.int64)
+
+    if len(values) != len(labels):
+        raise AssertionError("len(values) != len(labels)")
+
+    for i in range(N):
+        lab = labels[i]
+        if lab < 0:
+            continue
+
+        counts[lab] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            nobs[lab, j] += val == val and val != iNaT
+
+    for i in range(len(counts)):
+        for j in range(K):
+            out[i, j] = nobs[i, j]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_count_object(ndarray[float64_t, ndim=2] out,
+                         ndarray[int64_t] counts,
+                         ndarray[object, ndim=2] values,
+                         ndarray[int64_t] labels):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, lab
+        Py_ssize_t N = values.shape[0], K = values.shape[1]
+        object val
+        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
+                                                 dtype=np.int64)
+
+    if len(values) != len(labels):
+        raise AssertionError("len(values) != len(labels)")
+
+    for i in range(N):
+        lab = labels[i]
+        if lab < 0:
+            continue
+
+        counts[lab] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            nobs[lab, j] += val == val and val != iNaT
+
+    for i in range(len(counts)):
+        for j in range(K):
+            out[i, j] = nobs[i, j]
+
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_count_bin_float64(ndarray[float64_t, ndim=2] out,
+                             ndarray[int64_t] counts,
+                             ndarray[float64_t, ndim=2] values,
+                             ndarray[int64_t] bins):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, ngroups
+        Py_ssize_t N = values.shape[0], K = values.shape[1], b = 0
+        float64_t val
+        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
+                                                 dtype=np.int64)
+
+    ngroups = len(bins) + (bins[len(bins) - 1] != N)
+
+    for i in range(N):
+        while b < ngroups - 1 and i >= bins[b]:
+            b += 1
+
+        counts[b] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            nobs[b, j] += val == val and val != iNaT
+
+    for i in range(ngroups):
+        for j in range(K):
+            out[i, j] = nobs[i, j]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_count_bin_float32(ndarray[float32_t, ndim=2] out,
+                             ndarray[int64_t] counts,
+                             ndarray[float32_t, ndim=2] values,
+                             ndarray[int64_t] bins):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, ngroups
+        Py_ssize_t N = values.shape[0], K = values.shape[1], b = 0
+        float32_t val
+        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
+                                                 dtype=np.int64)
+
+    ngroups = len(bins) + (bins[len(bins) - 1] != N)
+
+    for i in range(N):
+        while b < ngroups - 1 and i >= bins[b]:
+            b += 1
+
+        counts[b] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            nobs[b, j] += val == val and val != iNaT
+
+    for i in range(ngroups):
+        for j in range(K):
+            out[i, j] = nobs[i, j]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_count_bin_int8(ndarray[float32_t, ndim=2] out,
+                             ndarray[int64_t] counts,
+                             ndarray[int8_t, ndim=2] values,
+                             ndarray[int64_t] bins):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, ngroups
+        Py_ssize_t N = values.shape[0], K = values.shape[1], b = 0
+        int8_t val
+        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
+                                                 dtype=np.int64)
+
+    ngroups = len(bins) + (bins[len(bins) - 1] != N)
+
+    for i in range(N):
+        while b < ngroups - 1 and i >= bins[b]:
+            b += 1
+
+        counts[b] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            nobs[b, j] += val == val and val != iNaT
+
+    for i in range(ngroups):
+        for j in range(K):
+            out[i, j] = nobs[i, j]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_count_bin_int16(ndarray[float32_t, ndim=2] out,
+                             ndarray[int64_t] counts,
+                             ndarray[int16_t, ndim=2] values,
+                             ndarray[int64_t] bins):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, ngroups
+        Py_ssize_t N = values.shape[0], K = values.shape[1], b = 0
+        int16_t val
+        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
+                                                 dtype=np.int64)
+
+    ngroups = len(bins) + (bins[len(bins) - 1] != N)
+
+    for i in range(N):
+        while b < ngroups - 1 and i >= bins[b]:
+            b += 1
+
+        counts[b] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            nobs[b, j] += val == val and val != iNaT
+
+    for i in range(ngroups):
+        for j in range(K):
+            out[i, j] = nobs[i, j]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_count_bin_int32(ndarray[float64_t, ndim=2] out,
+                             ndarray[int64_t] counts,
+                             ndarray[int32_t, ndim=2] values,
+                             ndarray[int64_t] bins):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, ngroups
+        Py_ssize_t N = values.shape[0], K = values.shape[1], b = 0
+        int32_t val
+        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
+                                                 dtype=np.int64)
+
+    ngroups = len(bins) + (bins[len(bins) - 1] != N)
+
+    for i in range(N):
+        while b < ngroups - 1 and i >= bins[b]:
+            b += 1
+
+        counts[b] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            nobs[b, j] += val == val and val != iNaT
+
+    for i in range(ngroups):
+        for j in range(K):
+            out[i, j] = nobs[i, j]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_count_bin_int64(ndarray[float64_t, ndim=2] out,
+                             ndarray[int64_t] counts,
+                             ndarray[int64_t, ndim=2] values,
+                             ndarray[int64_t] bins):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, ngroups
+        Py_ssize_t N = values.shape[0], K = values.shape[1], b = 0
+        int64_t val
+        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
+                                                 dtype=np.int64)
+
+    ngroups = len(bins) + (bins[len(bins) - 1] != N)
+
+    for i in range(N):
+        while b < ngroups - 1 and i >= bins[b]:
+            b += 1
+
+        counts[b] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            nobs[b, j] += val == val and val != iNaT
+
+    for i in range(ngroups):
+        for j in range(K):
+            out[i, j] = nobs[i, j]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_count_bin_object(ndarray[float64_t, ndim=2] out,
+                             ndarray[int64_t] counts,
+                             ndarray[object, ndim=2] values,
+                             ndarray[int64_t] bins):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, ngroups
+        Py_ssize_t N = values.shape[0], K = values.shape[1], b = 0
+        object val
+        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
+                                                 dtype=np.int64)
+
+    ngroups = len(bins) + (bins[len(bins) - 1] != N)
+
+    for i in range(N):
+        while b < ngroups - 1 and i >= bins[b]:
+            b += 1
+
+        counts[b] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            nobs[b, j] += val == val and val != iNaT
+
+    for i in range(ngroups):
+        for j in range(K):
+            out[i, j] = nobs[i, j]
+
+
+
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def left_join_indexer_unique_float64(ndarray[float64_t] left,
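
A note on the ``group_count_bin_*`` variants above: they aggregate over
ordinal bin edges rather than a per-row label array, and
``ngroups = len(bins) + (bins[len(bins) - 1] != N)`` accounts for one trailing
group when the last edge stops short of the end of the data. A minimal
pure-NumPy sketch of the same walk (illustrative only; ``count_by_bins`` is an
invented name, not part of the patch):

    import numpy as np

    def count_by_bins(values, bins):
        # values: 1-d float array; bins: increasing right edges into values
        N = len(values)
        ngroups = len(bins) + (bins[-1] != N)  # extra group past the last edge
        nobs = np.zeros(ngroups, dtype=np.int64)
        b = 0
        for i in range(N):
            while b < ngroups - 1 and i >= bins[b]:
                b += 1
            nobs[b] += values[i] == values[i]  # False only for NaN
        return nobs

    print(count_by_bins(np.array([1.0, np.nan, 2.0, 3.0]), np.array([2, 4])))
    # [1 2]
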
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
index 1b70ae0309b10..eb3c28b672fd4 100644
--- a/pandas/tests/test_groupby.py
+++ b/pandas/tests/test_groupby.py
@@ -1451,7 +1451,6 @@ def test_groupby_head_tail(self):
         assert_frame_equal(g_not_as[['B']].head(1), df_as.loc[[0,2], ['B']])
         assert_frame_equal(g_not_as[['A', 'B']].head(1), df_as.loc[[0,2]])
 
-
     def test_groupby_multiple_key(self):
         df = tm.makeTimeDataFrame()
         grouped = df.groupby([lambda x: x.year,
@@ -1629,6 +1628,21 @@ def test_cython_agg_nothing_to_agg(self):
                            'b': ['foo', 'bar'] * 25})
         self.assertRaises(DataError, frame[['b']].groupby(frame['a']).mean)
 
+    def test_cython_agg_nothing_to_agg_with_dates(self):
+        frame = DataFrame({'a': np.random.randint(0, 5, 50),
+                           'b': ['foo', 'bar'] * 25,
+                           'dates': pd.date_range('now', periods=50,
+                                                  freq='T')})
+        with tm.assertRaisesRegexp(DataError, "No numeric types to aggregate"):
+            frame.groupby('b').dates.mean()
+
+    def test_groupby_timedelta_cython_count(self):
+        df = DataFrame({'g': list('ab' * 2),
+                        'delt': np.arange(4).astype('timedelta64[ns]')})
+        expected = Series([2, 2], index=['a', 'b'], name='delt')
+        result = df.groupby('g').delt.count()
+        tm.assert_series_equal(expected, result)
+
     def test_cython_agg_frame_columns(self):
         # #2113
         df = DataFrame({'x': [1, 2, 3], 'y': [3, 4, 5]})
@@ -1992,7 +2006,8 @@ def test_count(self):
 
         # GH5610
         # count counts non-nulls
-        df = pd.DataFrame([[1, 2, 'foo'], [1, nan, 'bar'], [3, nan, nan]], columns=['A', 'B', 'C'])
+        df = pd.DataFrame([[1, 2, 'foo'], [1, nan, 'bar'], [3, nan, nan]],
+                          columns=['A', 'B', 'C'])
 
         count_as = df.groupby('A').count()
         count_not_as = df.groupby('A', as_index=False).count()
@@ -2005,6 +2020,19 @@ def test_count(self):
         count_B = df.groupby('A')['B'].count()
         assert_series_equal(count_B, expected['B'])
 
+    def test_count_object(self):
+        df = pd.DataFrame({'a': ['a'] * 3 + ['b'] * 3,
+                           'c': [2] * 3 + [3] * 3})
+        result = df.groupby('c').a.count()
+        expected = pd.Series([3, 3], index=[2, 3], name='a')
+        tm.assert_series_equal(result, expected)
+
+        df = pd.DataFrame({'a': ['a', np.nan, np.nan] + ['b'] * 3,
+                           'c': [2] * 3 + [3] * 3})
+        result = df.groupby('c').a.count()
+        expected = pd.Series([1, 3], index=[2, 3], name='a')
+        tm.assert_series_equal(result, expected)
+
     def test_non_cython_api(self):
 
         # GH5610
@@ -2354,7 +2382,6 @@ def test_groupby_aggregation_mixed_dtype(self):
 
         result = g[['v1','v2']].mean()
         assert_frame_equal(result,expected)
 
-
     def test_groupby_dtype_inference_empty(self):
         # GH 6733
         df = DataFrame({'x': [], 'range': np.arange(0,dtype='int64')})
@@ -3325,7 +3352,6 @@ def test_cumcount_groupby_not_col(self):
         assert_series_equal(expected, g.cumcount())
         assert_series_equal(expected, sg.cumcount())
 
-
     def test_filter_series(self):
         import pandas as pd
         s = pd.Series([1, 3, 20, 5, 22, 24, 7])
@@ -4168,6 +4194,14 @@ def test_nargsort(self):
         expected = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1))
         assert_equal(result, expected)
 
+    def test_datetime_count(self):
+        df = DataFrame({'a': [1,2,3] * 2,
+                        'dates': pd.date_range('now', periods=6, freq='T')})
+        result = df.groupby('a').dates.count()
+        expected = Series([2, 2, 2], index=Index([1, 2, 3], name='a'),
+                          name='dates')
+        tm.assert_series_equal(result, expected)
+
 
 def assert_fp_equal(a, b):
     assert (np.abs(a - b) < 1e-12).all()
diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx
index 6d99d38049e5a..df9c465c33853 100644
--- a/pandas/tslib.pyx
+++ b/pandas/tslib.pyx
@@ -462,6 +462,12 @@ class NaTType(_NaT):
     def __hash__(self):
         return iNaT
 
+    def __int__(self):
+        return NPY_NAT
+
+    def __long__(self):
+        return NPY_NAT
+
     def weekday(self):
         return -1
 
diff --git a/vb_suite/groupby.py b/vb_suite/groupby.py
index 01644153b28e1..638862ffd1367 100644
--- a/vb_suite/groupby.py
+++ b/vb_suite/groupby.py
@@ -119,6 +119,36 @@ def f():
                            setup, start_date=datetime(2011, 10, 1))
 
 #----------------------------------------------------------------------
+# count() speed
+
+setup = common_setup + """
+n = 10000
+offsets = np.random.randint(n, size=n).astype('timedelta64[ns]')
+
+dates = np.datetime64('now') + offsets
+dates[np.random.rand(n) > 0.5] = np.datetime64('nat')
+
+offsets[np.random.rand(n) > 0.5] = np.timedelta64('nat')
+
+value2 = np.random.randn(n)
+value2[np.random.rand(n) > 0.5] = np.nan
+
+obj = pd.util.testing.choice(['a', 'b'], size=n).astype(object)
+obj[np.random.rand(n) > 0.5] = np.nan
+
+df = DataFrame({'key1': np.random.randint(0, 500, size=n),
+                'key2': np.random.randint(0, 100, size=n),
+                'dates': dates,
+                'value2' : value2,
+                'value3' : np.random.randn(n),
+                'obj': obj,
+                'offsets': offsets})
+"""
+
+groupby_multi_count = Benchmark("df.groupby(['key1', 'key2']).count()",
+                                setup, name='groupby_multi_count',
+                                start_date=datetime(2014, 5, 5))
+#----------------------------------------------------------------------
 # Series.value_counts
 
 setup = common_setup + """
@@ -151,11 +181,11 @@ def f():
 ind2 = np.random.randint(0, 2, size=100000)
 
 df = DataFrame({'key1': fac1.take(ind1),
-                'key2': fac2.take(ind2),
-                'key3': fac2.take(ind2),
-                'value1' : np.random.randn(100000),
-                'value2' : np.random.randn(100000),
-                'value3' : np.random.randn(100000)})
+'key2': fac2.take(ind2),
+'key3': fac2.take(ind2),
+'value1' : np.random.randn(100000),
+'value2' : np.random.randn(100000),
+'value3' : np.random.randn(100000)})
 """
 
 stmt = "df.pivot_table(rows='key1', cols=['key2', 'key3'])"
@@ -192,13 +222,13 @@ def f():
                            start_date=datetime(2012, 5, 1))
 
 groupby_first_float32 = Benchmark('data2.groupby(labels).first()', setup,
-                                  start_date=datetime(2013, 1, 1))
+                           start_date=datetime(2013, 1, 1))
 
 groupby_last = Benchmark('data.groupby(labels).last()', setup,
                          start_date=datetime(2012, 5, 1))
 
 groupby_last_float32 = Benchmark('data2.groupby(labels).last()', setup,
-                                  start_date=datetime(2013, 1, 1))
+                           start_date=datetime(2013, 1, 1))
 
 
 #----------------------------------------------------------------------
@@ -256,9 +286,9 @@ def f():
 labels = np.random.randint(0, 2000, size=N)
 labels2 = np.random.randint(0, 3, size=N)
 df = DataFrame({'key': labels,
-                'key2': labels2,
-                'value1': randn(N),
-                'value2': ['foo', 'bar', 'baz', 'qux'] * (N / 4)})
+'key2': labels2,
+'value1': randn(N),
+'value2': ['foo', 'bar', 'baz', 'qux'] * (N / 4)})
 def f(g):
     return 1
 """
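
For reference, the user-visible behaviour exercised by the new tests (a sketch
assuming a build with this patch; the frame contents are invented for
illustration):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({
        'key': ['a', 'a', 'b', 'b'],
        'x': [1.0, np.nan, 2.0, 3.0],
        'when': pd.to_datetime(['2014-01-01', None, '2014-01-02', None]),
    })
    # count() tallies non-null values per group and column; NaN and NaT are
    # both excluded, and the result dtype is int64.
    print(df.groupby('key').count())
    #      x  when
    # key
    # a    1     1
    # b    2     1
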