
Commit d63cee8

ENH: preliminary Cython code and test type conversion

1 parent c84e9a8 commit d63cee8

4 files changed, +186 -13 lines


pandas/io/tests/test_cparser.py (+66)

@@ -0,0 +1,66 @@
+"""
+C/Cython ascii file parser tests
+"""
+
+from pandas.util.py3compat import StringIO, BytesIO
+from datetime import datetime
+import csv
+import os
+import sys
+import re
+import unittest
+
+import nose
+
+from numpy import nan
+import numpy as np
+
+from pandas import DataFrame, Series, Index, isnull, MultiIndex
+import pandas.io.parsers as parsers
+from pandas.io.parsers import (read_csv, read_table, read_fwf,
+                               ExcelFile, TextParser)
+from pandas.util.testing import (assert_almost_equal, assert_frame_equal,
+                                 assert_series_equal, network)
+import pandas.lib as lib
+from pandas.util import py3compat
+from pandas.lib import Timestamp
+from pandas.tseries.index import date_range
+
+
+import pandas._parser as parser
+
+
+def curpath():
+    pth, _ = os.path.split(os.path.abspath(__file__))
+    return pth
+
+class TestCParser(unittest.TestCase):
+
+    def setUp(self):
+        self.dirpath = curpath()
+        self.csv1 = os.path.join(self.dirpath, 'test1.csv')
+        self.csv2 = os.path.join(self.dirpath, 'test2.csv')
+        self.xls1 = os.path.join(self.dirpath, 'test.xls')
+
+    def test_string_filename(self):
+        reader = parser.TextReader(self.csv1)
+        result = reader.read()
+
+    def test_file_handle(self):
+        try:
+            f = open(self.csv1, 'rb')
+            reader = parser.TextReader(f)
+            result = reader.read()
+        finally:
+            f.close()
+
+    # def test_StringIO(self):
+    #     text = open(self.csv1, 'rb').read()
+
+    #     reader = parser.TextReader(BytesIO(text))
+    #     result = reader.read()
+
+if __name__ == '__main__':
+    nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'],
+                   exit=False)
+
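For orientation, here is a minimal sketch of the round trip these tests exercise: build a TextReader from a filename (or an open file handle) and call read(). Going by _convert_column_data in the parser.pyx changes below, read() at this preliminary stage returns a dict keyed by column index, with each value a float64 ndarray or None when a column fails to convert; the 'test1.csv' path is simply the fixture name used in setUp, not a required input.

    import pandas._parser as parser

    # Tokenize and convert a CSV with the experimental C reader. At this
    # stage only float64 conversion is wired up (see _try_double below).
    reader = parser.TextReader('test1.csv')
    columns = reader.read()

    # columns maps column index -> float64 ndarray (or None on failure)
    for i, values in columns.items():
        print(i, values)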

pandas/src/parser.pyx (+88 -6)

@@ -1,7 +1,19 @@
+cimport numpy as cnp
+import numpy as np
+
+cnp.import_array()
+
 cdef extern from "Python.h":
     ctypedef struct FILE
     FILE* PyFile_AsFile(object)

+cdef extern from "parser/conversions.h":
+    inline int to_double(char *item, double *p_value,
+                         char sci, char decimal)
+    inline int to_complex(char *item, double *p_real,
+                          double *p_imag, char sci, char decimal)
+    inline int to_longlong(char *item, long long *p_value)
+

 cdef extern from "parser/common.h":

@@ -91,6 +103,15 @@ cdef extern from "parser/common.h":
         # error handling
         char *error_msg

+    ctypedef struct coliter_t:
+        char **words
+        int *line_start
+        int col
+        int line
+
+    void coliter_setup(coliter_t *it, parser_t *parser, int i)
+    char* COLITER_NEXT(coliter_t it)
+
     parser_t* parser_new()

     int parser_init(parser_t *self)
@@ -107,6 +128,8 @@ cdef extern from "parser/common.h":
     int tokenize_all_rows(parser_t *self)
     int tokenize_nrows(parser_t *self, size_t nrows)

+DEFAULT_CHUNKSIZE = 256 * 1024
+
 cdef class TextReader:
     '''
@@ -118,21 +141,29 @@ cdef class TextReader:
        parser_t *parser
        object file_handle, should_close

+    cdef public:
+        object delimiter, na_values, converters, thousands, delim_whitespace
+
     def __cinit__(self, source, delimiter=',', header=0, memory_map=False,
+                  chunksize=DEFAULT_CHUNKSIZE,
                   delim_whitespace=False,
                   na_values=None,
                   converters=None,
                   thousands=None):
         self.parser = parser_new()
+        self.parser.chunksize = chunksize

         self._setup_parser_source(source)
+        set_parser_default_options(self.parser)
+
+        parser_init(self.parser)

         if delim_whitespace:
             raise NotImplementedError
         else:
             if len(delimiter) > 1:
                 raise ValueError('only length-1 separators excluded right now')
-            self.parser.delimiter = delimiter
+            self.parser.delimiter = (<char*> delimiter)[0]

         # TODO: no header vs. header is not the first row
         self.parser.header = header
@@ -142,7 +173,7 @@ cdef class TextReader:
         self.delimiter = delimiter
         self.delim_whitespace = delim_whitespace

-        self.na_values
+        self.na_values = na_values
         self.converters = converters
         self.thousands = thousands

@@ -158,12 +189,13 @@ cdef class TextReader:

         if isinstance(source, (basestring, file)):
             if isinstance(source, basestring):
-                self.file_handle = open(source, 'rb')
+                source = open(source, 'rb')
                 self.should_close = True
-                source = self.file_handle

+            self.file_handle = source
             status = parser_file_source_init(self.parser,
                                              PyFile_AsFile(source))
+
             if status != 0:
                 raise Exception('Initializing from file failed')

@@ -189,20 +221,70 @@ cdef class TextReader:
         """
         rows=None --> read all rows
         """
-        cdef int status
+        cdef:
+            int prior_lines
+            int status

         if rows is not None:
             raise NotImplementedError
         else:
             status = tokenize_all_rows(self.parser)

+        if status < 0:
+            raise_parser_error('Error tokenizing data', self.parser)
+
+        result = self._convert_column_data()
+
+        # debug_print_parser(self.parser)
+        return result
+
+    def _convert_column_data(self):
+        cdef:
+            Py_ssize_t i, ncols
+
+        ncols = self.parser.line_fields[0]
+
+        results = {}
+        for i in range(ncols):
+            col_res = _try_double(self.parser, i, 0, self.parser.lines)
+
+            results[i] = col_res
+
+        return results

 class CParserError(Exception):
     pass


+cdef _try_double(parser_t *parser, int col, int line_start, int line_end):
+    cdef:
+        int error
+        size_t i, lines
+        coliter_t it
+        char *word
+        double *data
+        cnp.ndarray result
+
+    lines = line_end - line_start
+
+    result = np.empty(lines, dtype=np.float64)
+
+    data = <double *> result.data
+
+    coliter_setup(&it, parser, col)
+    for i in range(lines):
+        word = COLITER_NEXT(it)
+        error = to_double(word, data, parser.sci, parser.decimal)
+
+        if error != 1:
+            return None
+
+        data += 1
+
+    return result
+
 cdef raise_parser_error(object base, parser_t *parser):
-    message = '%s. C error: '
+    message = '%s. C error: ' % base
     if parser.error_msg != NULL:
         message += parser.error_msg
     else:
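The new conversion path in _convert_column_data and _try_double is easiest to follow in pure Python. The sketch below mirrors the same logic over an in-memory list of tokenized rows instead of the C parser_t state: one float64 slot per line, walked column by column, with the whole column dropped to None as soon as a token fails to convert. The helper names here are illustrative only, not part of the committed API.

    import numpy as np

    def try_double_column(rows, col):
        # Analogue of _try_double: fill one float64 value per line and give
        # up on the whole column (return None) if any token fails to parse.
        result = np.empty(len(rows), dtype=np.float64)
        for i, fields in enumerate(rows):
            try:
                result[i] = float(fields[col])
            except ValueError:
                return None
        return result

    def convert_column_data(rows):
        # Analogue of _convert_column_data: the column count comes from the
        # first tokenized line; results are keyed by column index.
        ncols = len(rows[0])
        return dict((i, try_double_column(rows, i)) for i in range(ncols))

    rows = [['1', '2.5'], ['3', 'not-a-number']]
    print(convert_column_data(rows))   # {0: array([1., 3.]), 1: None}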

pandas/src/parser/common.h (+2 -1)

@@ -178,7 +178,8 @@ typedef struct coliter_t {
     int line;
 } coliter_t;

-
+void coliter_setup(coliter_t *self, parser_t *parser, int i);
+coliter_t *coliter_new(parser_t *self, int i);

 /* #define COLITER_NEXT(iter) iter->words[iter->line_start[iter->line++] + iter->col] */
 #define COLITER_NEXT(iter) iter.words[iter.line_start[iter.line++] + iter.col]
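For context on what the Cython code declares above, COLITER_NEXT reads straight out of the tokenizer's flat storage: words holds every field of every line back to back, line_start[line] gives the offset of each line's first field, so column col of a given line sits at words[line_start[line] + col]. A rough Python rendering of that access pattern, with made-up sample data and assuming nothing beyond the macro shown:

    # Flat token storage as the tokenizer lays it out: all fields, line by line.
    words = ['a', '1', '2.5',     # line 0
             'b', '3', '4.5']     # line 1
    line_start = [0, 3]           # index of each line's first field in words

    def column_iter(words, line_start, col):
        # Python analogue of coliter_setup + repeated COLITER_NEXT calls.
        for start in line_start:
            yield words[start + col]

    print(list(column_iter(words, line_start, 1)))   # ['1', '3']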

pandas/src/parser/rows.c (+30 -6)

@@ -130,6 +130,7 @@ void del_file_source(void *fs) {
     // TODO: error codes?
     // fclose(FS(fs)->fp);

+    // fseek(FS(fs)->fp, FS(fs)->initial_file_pos, SEEK_SET);
     // allocated on the heap
     free(fs);
 }
@@ -621,7 +622,7 @@ int convert_infer(parser_t *parser, array_t* result,

 void set_parser_default_options(parser_t *self) {
     // File buffer preferences
-    self->sourcetype = 'F';
+    // self->sourcetype = 'F';

     // parsing, type inference
     self->infer_types = 1;
@@ -659,10 +660,11 @@ parser_t* parser_new() {
 }

 int parser_file_source_init(parser_t *self, FILE* fp) {
-    FS(self->source)->fp = fp;
+    self->sourcetype = 'F';
+    self->source = new_file_source(fp);

     // Only allocate this heap memory if we are not memory-mapping the file
-    self->data = (char*) malloc(self->chunksize * sizeof(char));
+    self->data = (char*) malloc((self->chunksize + 1) * sizeof(char));

     if (self->data == NULL) {
         return PARSER_OUT_OF_MEMORY;
@@ -680,7 +682,9 @@ int parser_gzip_source_init(parser_t *self, FILE* fp) {
 }

 int parser_array_source_init(parser_t *self, char *bytes, size_t length) {
+    self->sourcetype = 'A';
     self->source = new_array_source(bytes, length);
+    return 0;
 }

 int parser_init(parser_t *self) {
@@ -959,15 +963,15 @@ int parser_cleanup(parser_t *self) {

 int parser_buffer_bytes(parser_t *self, size_t nbytes) {
     size_t bytes;
+    void *src = self->source;

     // This should probably end up as a method table

     switch(self->sourcetype) {
         case 'F': // basic FILE*

-            bytes = fread((void *) self->data,
-                          sizeof(char), nbytes,
-                          FS(self->source)->fp);
+            bytes = fread((void *) self->data, sizeof(char), nbytes,
+                          FS(src)->fp);
             self->datalen = bytes;

             TRACE(("Read %d bytes\n", (int) bytes));
@@ -980,6 +984,24 @@ int parser_buffer_bytes(parser_t *self, size_t nbytes) {
             break;

         case 'A': // in-memory bytes (e.g. from StringIO)
+            if (ARS(src)->position == ARS(src)->length) {
+                return REACHED_EOF;
+            }
+
+            self->data = ARS(src)->data + ARS(src)->position;
+
+            if (ARS(src)->position + nbytes > ARS(src)->length) {
+                // fewer than nbytes remaining
+                self->datalen = ARS(src)->length - ARS(src)->position;
+            } else {
+                self->datalen = nbytes;
+            }
+
+            ARS(src)->position += self->datalen;
+
+            TRACE(("datalen: %d\n", self->datalen));
+
+            TRACE(("pos: %d, length: %d", ARS(src)->position, ARS(src)->length));

             break;

@@ -1310,6 +1332,8 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all) {

         status = parser_buffer_bytes(self, self->chunksize);

+        TRACE(("sourcetype: %c, status: %d\n", self->sourcetype, status));
+
         if (status == REACHED_EOF) {
             // XXX close last line
             status = parser_handle_eof(self);
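The new 'A' (in-memory array source) branch of parser_buffer_bytes is plain bounds arithmetic on a (data, position, length) triple: report EOF once the cursor reaches the end, otherwise expose at most nbytes of the remaining buffer and advance the cursor. A small Python sketch of that logic, with REACHED_EOF stood in by a sentinel string and a dict in place of the array_source struct:

    def buffer_bytes(src, nbytes):
        # src mirrors the fields reached through ARS(src) above:
        # 'data' (bytes), 'position' and 'length'.
        if src['position'] == src['length']:
            return 'REACHED_EOF', b''
        remaining = src['length'] - src['position']
        datalen = min(nbytes, remaining)   # fewer than nbytes may remain
        chunk = src['data'][src['position']:src['position'] + datalen]
        src['position'] += datalen
        return 'OK', chunk

    src = {'data': b'a,b\n1,2\n', 'position': 0, 'length': 8}
    print(buffer_bytes(src, 5))   # ('OK', b'a,b\n1')
    print(buffer_bytes(src, 5))   # ('OK', b',2\n')
    print(buffer_bytes(src, 5))   # ('REACHED_EOF', b'')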
