Skip to content

CLN: remove some build warnings #15259

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 18 additions & 29 deletions pandas/parser.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -716,11 +716,10 @@ cdef class TextReader:
# header is now a list of lists, so field_count should use header[0]

cdef:
size_t i, start, data_line, field_count, passed_count, hr, unnamed_count # noqa
Py_ssize_t i, start, field_count, passed_count, unnamed_count # noqa
char *word
object name
int status
Py_ssize_t size
int status, hr, data_line
char *errors = "strict"
cdef StringPath path = _string_path(self.c_encoding)

Expand Down Expand Up @@ -1416,8 +1415,7 @@ cdef _string_box_factorize(parser_t *parser, int col,
bint na_filter, kh_str_t *na_hashset):
cdef:
int error, na_count = 0
Py_ssize_t i
size_t lines
Py_ssize_t i, lines
coliter_t it
const char *word = NULL
ndarray[object] result
Expand Down Expand Up @@ -1470,8 +1468,7 @@ cdef _string_box_utf8(parser_t *parser, int col,
bint na_filter, kh_str_t *na_hashset):
cdef:
int error, na_count = 0
Py_ssize_t i
size_t lines
Py_ssize_t i, lines
coliter_t it
const char *word = NULL
ndarray[object] result
Expand Down Expand Up @@ -1525,8 +1522,7 @@ cdef _string_box_decode(parser_t *parser, int col,
char *encoding):
cdef:
int error, na_count = 0
Py_ssize_t i, size
size_t lines
Py_ssize_t i, size, lines
coliter_t it
const char *word = NULL
ndarray[object] result
Expand Down Expand Up @@ -1586,8 +1582,7 @@ cdef _categorical_convert(parser_t *parser, int col,
"Convert column data into codes, categories"
cdef:
int error, na_count = 0
Py_ssize_t i, size
size_t lines
Py_ssize_t i, size, lines
coliter_t it
const char *word = NULL

Expand Down Expand Up @@ -1691,7 +1686,7 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end,
bint na_filter, kh_str_t *na_hashset, object na_flist):
cdef:
int error, na_count = 0
size_t i, lines
Py_ssize_t i, lines
coliter_t it
const char *word = NULL
char *p_end
Expand Down Expand Up @@ -1738,8 +1733,7 @@ cdef inline int _try_double_nogil(parser_t *parser,
int *na_count) nogil:
cdef:
int error,
size_t i
size_t lines = line_end - line_start
Py_ssize_t i, lines = line_end - line_start
coliter_t it
const char *word = NULL
char *p_end
Expand Down Expand Up @@ -1801,7 +1795,7 @@ cdef _try_uint64(parser_t *parser, int col, int line_start, int line_end,
bint na_filter, kh_str_t *na_hashset):
cdef:
int error
size_t i, lines
Py_ssize_t i, lines
coliter_t it
uint64_t *data
ndarray result
Expand Down Expand Up @@ -1837,8 +1831,7 @@ cdef inline int _try_uint64_nogil(parser_t *parser, int col, int line_start,
uint64_t *data, uint_state *state) nogil:
cdef:
int error
size_t i
size_t lines = line_end - line_start
Py_ssize_t i, lines = line_end - line_start
coliter_t it
const char *word = NULL
khiter_t k
Expand Down Expand Up @@ -1873,7 +1866,7 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end,
bint na_filter, kh_str_t *na_hashset):
cdef:
int error, na_count = 0
size_t i, lines
Py_ssize_t i, lines
coliter_t it
int64_t *data
ndarray result
Expand Down Expand Up @@ -1902,8 +1895,7 @@ cdef inline int _try_int64_nogil(parser_t *parser, int col, int line_start,
int64_t *data, int *na_count) nogil:
cdef:
int error
size_t i
size_t lines = line_end - line_start
Py_ssize_t i, lines = line_end - line_start
coliter_t it
const char *word = NULL
khiter_t k
Expand Down Expand Up @@ -1939,7 +1931,7 @@ cdef _try_bool(parser_t *parser, int col, int line_start, int line_end,
bint na_filter, kh_str_t *na_hashset):
cdef:
int na_count
size_t lines = line_end - line_start
Py_ssize_t lines = line_end - line_start
uint8_t *data
cnp.ndarray[cnp.uint8_t, ndim=1] result

Expand All @@ -1963,8 +1955,7 @@ cdef inline int _try_bool_nogil(parser_t *parser, int col, int line_start,
uint8_t *data, int *na_count) nogil:
cdef:
int error
size_t lines = line_end - line_start
size_t i
Py_ssize_t i, lines = line_end - line_start
coliter_t it
const char *word = NULL
khiter_t k
Expand Down Expand Up @@ -2004,7 +1995,7 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end,
const kh_str_t *false_hashset):
cdef:
int error, na_count = 0
size_t i, lines
Py_ssize_t i, lines
coliter_t it
const char *word = NULL
uint8_t *data
Expand Down Expand Up @@ -2033,8 +2024,7 @@ cdef inline int _try_bool_flex_nogil(parser_t *parser, int col, int line_start,
int *na_count) nogil:
cdef:
int error = 0
size_t i
size_t lines = line_end - line_start
Py_ssize_t i, lines = line_end - line_start
coliter_t it
const char *word = NULL
khiter_t k
Expand Down Expand Up @@ -2249,8 +2239,7 @@ cdef _apply_converter(object f, parser_t *parser, int col,
char* c_encoding):
cdef:
int error
Py_ssize_t i
size_t lines
Py_ssize_t i, lines
coliter_t it
const char *word = NULL
char *errors = "strict"
Expand Down Expand Up @@ -2341,7 +2330,7 @@ def _to_structured_array(dict columns, object names, object usecols):
cdef _fill_structured_column(char *dst, char* src, int elsize,
int stride, int length, bint incref):
cdef:
size_t i
Py_ssize_t i

if incref:
util.transfer_object_column(dst, src, stride, length)
Expand Down
53 changes: 45 additions & 8 deletions pandas/src/algos_groupby_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -361,7 +361,11 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
val = values[i, j]

# not nan
{{if name == 'int64'}}
if val != {{nan_val}}:
{{else}}
if val == val and val != {{nan_val}}:
{{endif}}
nobs[lab, j] += 1
resx[lab, j] = val

Expand Down Expand Up @@ -407,7 +411,11 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
val = values[i, j]

# not nan
{{if name == 'int64'}}
if val != {{nan_val}}:
{{else}}
if val == val and val != {{nan_val}}:
{{endif}}
nobs[lab, j] += 1
if nobs[lab, j] == rank:
resx[lab, j] = val
Expand Down Expand Up @@ -478,7 +486,11 @@ def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
val = values[i, j]

# not nan
{{if name == 'int64'}}
if val != {{nan_val}}:
{{else}}
if val == val and val != {{nan_val}}:
{{endif}}
nobs[lab, j] += 1
if val > maxx[lab, j]:
maxx[lab, j] = val
Expand All @@ -492,7 +504,11 @@ def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
val = values[i, 0]

# not nan
{{if name == 'int64'}}
if val != {{nan_val}}:
{{else}}
if val == val and val != {{nan_val}}:
{{endif}}
nobs[lab, 0] += 1
if val > maxx[lab, 0]:
maxx[lab, 0] = val
Expand Down Expand Up @@ -541,8 +557,11 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
val = values[i, j]

# not nan
{{if name == 'int64'}}
if val != {{nan_val}}:
{{else}}
if val == val and val != {{nan_val}}:

{{endif}}
nobs[lab, j] += 1
if val < minx[lab, j]:
minx[lab, j] = val
Expand All @@ -556,7 +575,11 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
val = values[i, 0]

# not nan
{{if name == 'int64'}}
if val != {{nan_val}}:
{{else}}
if val == val and val != {{nan_val}}:
{{endif}}
nobs[lab, 0] += 1
if val < minx[lab, 0]:
minx[lab, 0] = val
Expand Down Expand Up @@ -596,14 +619,19 @@ def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
continue
for j in range(K):
val = values[i, j]

# val = nan
{{if name == 'int64'}}
if is_datetimelike and val == {{nan_val}}:
out[i, j] = {{nan_val}}
else:
{{else}}
if val == val:
{{endif}}
if val < accum[lab, j]:
min_val = val
accum[lab, j] = min_val
out[i, j] = accum[lab, j]
# val = nan
elif is_datetimelike:
out[i, j] = {{nan_val}}


@cython.boundscheck(False)
Expand Down Expand Up @@ -633,14 +661,18 @@ def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
continue
for j in range(K):
val = values[i, j]

{{if name == 'int64'}}
if is_datetimelike and val == {{nan_val}}:
out[i, j] = {{nan_val}}
else:
{{else}}
if val == val:
{{endif}}
if val > accum[lab, j]:
max_val = val
accum[lab, j] = max_val
out[i, j] = accum[lab, j]
# val = nan
elif is_datetimelike:
out[i, j] = {{nan_val}}

{{endfor}}

Expand Down Expand Up @@ -738,7 +770,12 @@ def group_cumsum(numeric[:, :] out,
continue
for j in range(K):
val = values[i, j]
if val == val:

if numeric == float32_t or numeric == float64_t:
if val == val:
accum[lab, j] += val
out[i, j] = accum[lab, j]
else:
accum[lab, j] += val
out[i, j] = accum[lab, j]

Expand Down
8 changes: 1 addition & 7 deletions pandas/src/algos_rank_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -175,11 +175,7 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True,

count += 1.0

{{if dtype == 'float64'}}
if i == n - 1 or sorted_data[i + 1] != val:
{{else}}
if i == n - 1 or fabs(sorted_data[i + 1] - val) > 0:
{{endif}}
if tiebreak == TIEBREAK_AVERAGE:
for j in range(i - dups + 1, i + 1):
ranks[argsorted[j]] = sum_ranks / dups
Expand Down Expand Up @@ -345,10 +341,8 @@ def rank_2d_{{dtype}}(object in_arr, axis=0, ties_method='average',

{{if dtype == 'object'}}
if j == k - 1 or are_diff(values[i, j + 1], val):
{{elif dtype == 'float64'}}
if j == k - 1 or values[i, j + 1] != val:
{{else}}
if j == k - 1 or fabs(values[i, j + 1] - val) > FP_ERR:
if j == k - 1 or values[i, j + 1] != val:
{{endif}}
if tiebreak == TIEBREAK_AVERAGE:
for z in range(j - dups + 1, j + 1):
Expand Down
2 changes: 2 additions & 0 deletions pandas/src/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -386,9 +386,11 @@ cdef class {{name}}HashTable(HashTable):
val = values[i]

# specific for groupby
{{if dtype != 'uint64'}}
if val < 0:
labels[i] = -1
continue
{{endif}}

k = kh_get_{{dtype}}(self.table, val)
if k != self.table.n_buckets:
Expand Down
13 changes: 9 additions & 4 deletions pandas/src/hashtable_func_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,12 @@ cdef build_count_table_{{dtype}}({{dtype}}_t[:] values,

for i in range(n):
val = values[i]

{{if dtype == 'float64'}}
if val == val or not dropna:
{{else}}
if True:
{{endif}}
k = kh_get_{{ttype}}(table, val)
if k != table.n_buckets:
table.vals[k] += 1
Expand All @@ -85,7 +90,7 @@ cpdef value_count_{{dtype}}({{dtype}}_t[:] values, bint dropna):
int64_t[:] result_counts
{{endif}}

int k
Py_ssize_t k

table = kh_init_{{ttype}}()
{{if dtype == 'object'}}
Expand Down Expand Up @@ -133,11 +138,11 @@ def duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first'):
def duplicated_{{dtype}}({{dtype}}_t[:] values, object keep='first'):
{{endif}}
cdef:
int ret = 0, k
int ret = 0
{{if dtype != 'object'}}
{{dtype}}_t value
{{endif}}
Py_ssize_t i, n = len(values)
Py_ssize_t k, i, n = len(values)
kh_{{ttype}}_t * table = kh_init_{{ttype}}()
ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')

Expand Down Expand Up @@ -230,7 +235,7 @@ def mode_{{dtype}}({{ctype}}[:] values):
cdef:
int count, max_count = 2
int j = -1 # so you can do +=
int k
Py_ssize_t k
kh_{{table_type}}_t *table
ndarray[{{ctype}}] modes

Expand Down
2 changes: 1 addition & 1 deletion pandas/src/parser/io.c
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ void *buffer_mmap_bytes(void *source, size_t nbytes, size_t *bytes_read,

retval = src->memmap + src->position;

if (src->position + nbytes > src->last_pos) {
if (src->position + (off_t)nbytes > src->last_pos) {
// fewer than nbytes remaining
*bytes_read = src->last_pos - src->position;
} else {
Expand Down
Loading