Skip to content

Commit c4d36ca

Browse files
committed
ENH: int parsing, etc.
1 parent a9db003 commit c4d36ca

File tree

3 files changed

+83
-16
lines changed

3 files changed

+83
-16
lines changed

pandas/src/parser.pyx

+70-8
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,19 @@
1-
cimport numpy as cnp
1+
from numpy cimport *
22
import numpy as np
33

4-
cnp.import_array()
4+
import_array()
55

66
from khash cimport *
77

88
from cpython cimport PyString_FromString, Py_INCREF, PyString_AsString
99

10+
# Make the fixed-width integer limits from <stdint.h> visible to Cython.
# Each limit is declared as a member of an anonymous enum, which is the usual
# Cython way to reference an externally defined C integer constant by name.
# In the visible hunks only INT64_MIN / INT64_MAX are used (as the bounds
# passed to str_to_int64 in _try_int64).
cdef extern from "stdint.h":
11+
enum: INT64_MAX
12+
enum: INT64_MIN
13+
enum: INT32_MAX
14+
enum: INT32_MIN
15+
16+
1017
cdef extern from "Python.h":
1118
ctypedef struct FILE
1219
FILE* PyFile_AsFile(object)
@@ -132,6 +139,12 @@ cdef extern from "parser/common.h":
132139
int tokenize_all_rows(parser_t *self)
133140
int tokenize_nrows(parser_t *self, size_t nrows)
134141

142+
# String-to-integer helpers from parser/common.h. Each returns the parsed
# value and reports status through the `error` out-parameter; _try_int64
# treats a nonzero `error` as "not an integer".
# NOTE(review): the prototypes in common.h take `const char *p_item`, while
# these declarations use plain `char *` -- confirm dropping const is intended.
int64_t str_to_int64(char *p_item, int64_t int_min,
143+
int64_t int_max, int *error)
144+
uint64_t str_to_uint64(char *p_item, uint64_t uint_max, int *error)
145+
146+
147+
135148
DEFAULT_CHUNKSIZE = 256 * 1024
136149

137150
cdef class TextReader:
@@ -144,6 +157,7 @@ cdef class TextReader:
144157
cdef:
145158
parser_t *parser
146159
object file_handle, should_close
160+
bint factorize
147161

148162
cdef public:
149163
object delimiter, na_values, converters, thousands, delim_whitespace
@@ -155,7 +169,8 @@ cdef class TextReader:
155169
delim_whitespace=False,
156170
na_values=None,
157171
converters=None,
158-
thousands=None):
172+
thousands=None,
173+
factorize=True):
159174
self.parser = parser_new()
160175
self.parser.chunksize = chunksize
161176

@@ -171,6 +186,8 @@ cdef class TextReader:
171186
raise ValueError('only length-1 separators excluded right now')
172187
self.parser.delimiter = (<char*> delimiter)[0]
173188

189+
self.factorize = factorize
190+
174191
# TODO: no header vs. header is not the first row
175192
self.parser.header = header
176193

@@ -255,16 +272,27 @@ cdef class TextReader:
255272
def _convert_column_data(self):
256273
cdef:
257274
Py_ssize_t i, ncols
275+
cast_func func
258276

259277
ncols = self.parser.line_fields[0]
260278

261279
results = {}
262280
for i in range(ncols):
263-
col_res = _try_double(self.parser, i, 0, self.parser.lines)
281+
col_res = None
282+
for func in cast_func_order:
283+
col_res = func(self.parser, i, 0, self.parser.lines)
284+
if col_res is not None:
285+
results[i] = col_res
286+
break
264287

265288
if col_res is None:
266-
col_res = _string_box_factorize(self.parser, i,
267-
0, self.parser.lines)
289+
raise Exception('Unable to parse column %d' % i)
290+
291+
# col_res = _try_double(self.parser, i, 0, self.parser.lines)
292+
293+
# if col_res is None:
294+
# col_res = _string_box_factorize(self.parser, i,
295+
# 0, self.parser.lines)
268296

269297
results[i] = col_res
270298

@@ -277,6 +305,9 @@ class CParserError(Exception):
277305
# ----------------------------------------------------------------------
278306
# Type conversions / inference support code
279307

308+
# Function-pointer type shared by the column converters (_try_int64,
# _try_double, _string_box_factorize) so they can be stored in the
# cast_func_order array and tried one after another.
# A converter receives the parser, a column index and a [line_start, line_end)
# pair and returns a Python object; _convert_column_data interprets a None
# result as "this converter could not handle the column".
ctypedef object (*cast_func)(parser_t *parser, int col,
309+
int line_start, int line_end)
310+
280311
cdef _string_box_factorize(parser_t *parser, int col,
281312
int line_start, int line_end):
282313
cdef:
@@ -285,7 +316,7 @@ cdef _string_box_factorize(parser_t *parser, int col,
285316
size_t lines
286317
coliter_t it
287318
char *word
288-
cnp.ndarray[object] result
319+
ndarray[object] result
289320

290321
int ret = 0
291322
kh_strbox_t *table
@@ -329,7 +360,7 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end):
329360
coliter_t it
330361
char *word
331362
double *data
332-
cnp.ndarray result
363+
ndarray result
333364

334365
lines = line_end - line_start
335366

@@ -349,6 +380,37 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end):
349380

350381
return result
351382

383+
# Try to convert column `col` of the tokenized data to a 64-bit integer array.
#
# Parameters: `parser` is the tokenizer state, `col` the column index, and
# `line_end - line_start` the number of rows to convert.
# Returns an int64 ndarray holding one parsed value per row, or None when
# str_to_int64 reports a nonzero error for a field.
#
# NOTE(review): indentation is not preserved in this diff view; the
# `if error != 0` check presumably sits inside the loop so that conversion
# stops at the first bad field -- confirm against the real file.
cdef _try_int64(parser_t *parser, int col, int line_start, int line_end):
384+
cdef:
385+
# NOTE(review): `error` is never initialized here; the code relies on
# str_to_int64 writing it on every call -- confirm in the C implementation.
int error
386+
size_t i, lines
387+
coliter_t it
388+
char *word
389+
int64_t *data
390+
ndarray result
391+
# Row count. NOTE(review): line_start only contributes to this count;
# coliter_setup below is not given a starting line, so iteration presumably
# begins at the column's first row regardless of line_start -- confirm.
392+
lines = line_end - line_start
393+
# Uninitialized output buffer; every slot is overwritten by the loop below.
394+
result = np.empty(lines, dtype=np.int64)
395+
# Raw pointer into the array's buffer so values can be stored without
# going through Python-level indexing.
396+
data = <int64_t *> result.data
397+
398+
coliter_setup(&it, parser, col)
399+
for i in range(lines):
400+
word = COLITER_NEXT(it)
# Parse with the full int64 range as bounds; status comes back in `error`.
401+
data[i] = str_to_int64(word, INT64_MIN, INT64_MAX, &error);
402+
# A nonzero status means the field is not an acceptable int64: give up on
# this converter so the caller can try the next one.
403+
if error != 0:
404+
return None
405+
406+
return result
407+
408+
409+
# Converters in the order _convert_column_data tries them for each column:
# int64 first, then double, then string factorization. The first converter
# that returns a non-None result supplies the column's data.
cdef cast_func cast_func_order[3]
410+
cast_func_order[0] = _try_int64
411+
cast_func_order[1] = _try_double
412+
cast_func_order[2] = _string_box_factorize
413+
352414
cdef raise_parser_error(object base, parser_t *parser):
353415
message = '%s. C error: ' % base
354416
if parser.error_msg != NULL:

pandas/src/parser/common.h

+11
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,13 @@
99
#include <time.h>
1010
#include <errno.h>
1111

12+
13+
/* Fixed-width integer types (int64_t, uint64_t) are needed by the
 * str_to_int64 / str_to_uint64 prototypes declared later in this header.
 * This include block was moved here from rows.c in the same commit.
 * NOTE(review): ms_stdint.h is presumably a bundled substitute for MSVC
 * toolchains that do not ship <stdint.h> -- confirm. */
#if defined(_MSC_VER)
14+
#include "ms_stdint.h"
15+
#else
16+
#include <stdint.h>
17+
#endif
18+
1219
// #include "Python.h"
1320
// #include "structmember.h"
1421

@@ -214,4 +221,8 @@ int tokenize_all_rows(parser_t *self);
214221
*/
215222
int clear_parsed_lines(parser_t *self, size_t nlines);
216223

224+
/* String-to-integer conversion helpers. They are declared in this shared
 * header (rather than locally in rows.c, where the prototypes are now
 * commented out) so that parser.pyx can call them.
 * The converted value is the return value; status is reported through
 * *error, and the caller in parser.pyx treats any nonzero value as failure.
 * NOTE(review): int_min / int_max / uint_max presumably bound the accepted
 * range -- confirm against the implementation. */
int64_t str_to_int64(const char *p_item, int64_t int_min,
225+
int64_t int_max, int *error);
226+
uint64_t str_to_uint64(const char *p_item, uint64_t uint_max, int *error);
227+
217228
#endif // _PARSER_COMMON_H_

pandas/src/parser/rows.c

+2-8
Original file line numberDiff line numberDiff line change
@@ -55,14 +55,8 @@
5555

5656

5757

58-
int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, int *error);
59-
uint64_t str_to_uint64(const char *p_item, uint64_t uint_max, int *error);
60-
61-
#if defined(_MSC_VER)
62-
#include "ms_stdint.h"
63-
#else
64-
#include <stdint.h>
65-
#endif
58+
/* int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, int *error); */
59+
/* uint64_t str_to_uint64(const char *p_item, uint64_t uint_max, int *error); */
6660

6761

6862
void free_if_not_null(void *ptr) {

0 commit comments

Comments
 (0)