forked from pandas-dev/pandas
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtokenizer.h
278 lines (202 loc) · 6.84 KB
/
tokenizer.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
/*
Copyright (c) 2012, Lambda Foundry, Inc., except where noted
Incorporates components of WarrenWeckesser/textreader, licensed under 3-clause
BSD
See LICENSE for the license
*/
#ifndef _PARSER_COMMON_H_
#define _PARSER_COMMON_H_
#include "Python.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <time.h>
#include <errno.h>
#include <ctype.h>
/*
 * Status codes of the ERROR_OK family: 0 means success, every other value
 * names one kind of failure.
 * NOTE(review): presumably these are what str_to_int64() (declared later in
 * this header) stores through its `error` out-parameter -- confirm in
 * tokenizer.c.
 * ERROR_NO_DIGITS has the same numeric value (1) as ERROR_OUT_OF_MEMORY,
 * which is defined further down in this header, so the two families must not
 * be mixed in a single status variable.
 */
#define ERROR_OK 0
#define ERROR_NO_DIGITS 1
#define ERROR_OVERFLOW 2
#define ERROR_INVALID_CHARS 3
#define ERROR_MINUS_SIGN 4
#include "../headers/stdint.h"
#include "khash.h"
/*
 * Buffer sizing constants and I/O status codes.
 *
 * Every expression-valued macro is fully parenthesized so that it expands
 * correctly next to operators of higher or equal precedence
 * (e.g. `x % CHUNKSIZE`, `n / MB`).
 */
#define CHUNKSIZE (1024 * 256)
#define KB 1024
#define MB (1024 * KB)
#define STREAM_INIT_SIZE 32
/*
 * NOTE(review): presumably the values reported through the `status`
 * out-parameter of io_callback -- confirm against the callback
 * implementations.
 */
#define REACHED_EOF 1
#define CALLING_READ_FAILED 2
/*
 * P_INLINE: function qualifier macro chosen per compiler.
 * The outer #ifndef lets a definition supplied before this header is
 * included (e.g. on the compiler command line) take precedence.
 */
#ifndef P_INLINE
#if defined(__GNUC__)
/* GCC-compatible compilers: internal linkage plus the __inline__ spelling. */
#define P_INLINE static __inline__
#elif defined(_MSC_VER)
/* MSVC: expands to nothing, so the qualifier adds neither static nor inline. */
#define P_INLINE
#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
/* Any other C99-or-later compiler: standard `static inline`. */
#define P_INLINE static inline
#else
/* Unknown / pre-C99 compiler: expands to nothing. */
#define P_INLINE
#endif
#endif
/*
 * MSVC builds: every use of strtoll in code that includes this header is
 * rewritten to the CRT function _strtoi64.
 * NOTE(review): the remap is unconditional for all _MSC_VER values; verify
 * whether the targeted MSVC versions already provide strtoll themselves.
 */
#if defined(_MSC_VER)
#define strtoll _strtoi64
#endif
/*
C flat file parsing low level code for pandas / NumPy
*/
/* Integer boolean constants (plain 0 / 1, no dedicated boolean type). */
#define FALSE 0
#define TRUE 1
/* Maximum number of columns in a file. */
#define MAX_NUM_COLUMNS 2000
/* Maximum number of characters in a single field. */
#define FIELD_BUFFER_SIZE 2000
/*
 * Common set of error types for the read_rows() and tokenize()
 * functions.
 *
 * The values are not contiguous. ERROR_OUT_OF_MEMORY (1) has the same
 * numeric value as ERROR_NO_DIGITS from the ERROR_OK family defined earlier
 * in this header, so a bare status value of 1 is ambiguous unless the caller
 * knows which family it came from.
 */
#define ERROR_OUT_OF_MEMORY 1
#define ERROR_INVALID_COLUMN_INDEX 10
#define ERROR_CHANGED_NUMBER_OF_FIELDS 12
#define ERROR_TOO_MANY_CHARS 21
#define ERROR_TOO_MANY_FIELDS 22
#define ERROR_NO_DATA 23
/*
 * Debug tracing. Uncomment the VERBOSE define below (or define VERBOSE
 * before including this header) to enable it.
 *
 * TRACE takes ONE argument that is itself a parenthesized printf argument
 * list, i.e. it is called with double parentheses:
 *     TRACE(("value = %d\n", value))
 * When VERBOSE is defined the expansion is `printf (...);` -- note that the
 * terminating semicolon is part of the macro. Otherwise the call expands to
 * nothing at all and its arguments are never evaluated.
 */
/* #define VERBOSE */
#if defined(VERBOSE)
#define TRACE(X) printf X;
#else
#define TRACE(X)
#endif
/*
 * Negative status value signalling an out-of-memory condition.
 * NOTE(review): presumably returned by the int-returning parser_* and
 * tokenize_* functions declared in this header -- confirm in tokenizer.c.
 */
#define PARSER_OUT_OF_MEMORY -1
/*
* XXX Might want to couple count_rows() with read_rows() to avoid duplication
* of some file I/O.
*/
/*
 * WORD_BUFFER_SIZE determines the maximum amount of non-delimiter
 * text in a row, counted in characters.
 */
#define WORD_BUFFER_SIZE 4000
/*
 * ParserState: the states of the tokenizer; the current one is kept in
 * parser_t.state.
 *
 * The numeric values are written out explicitly. They are exactly the values
 * the compiler would assign implicitly (0, 1, 2, ...), so the numbering is
 * unchanged for any code that depends on it.
 */
typedef enum {
    START_RECORD                = 0,
    START_FIELD                 = 1,
    ESCAPED_CHAR                = 2,
    IN_FIELD                    = 3,
    IN_QUOTED_FIELD             = 4,
    ESCAPE_IN_QUOTED_FIELD      = 5,
    QUOTE_IN_QUOTED_FIELD       = 6,
    EAT_CRNL                    = 7,
    EAT_CRNL_NOP                = 8,
    EAT_WHITESPACE              = 9,
    EAT_COMMENT                 = 10,
    EAT_LINE_COMMENT            = 11,
    WHITESPACE_LINE             = 12,
    SKIP_LINE                   = 13,
    QUOTE_IN_SKIP_LINE          = 14,
    QUOTE_IN_QUOTE_IN_SKIP_LINE = 15,
    FINISHED                    = 16
} ParserState;
/*
 * QuoteStyle: the available quoting styles.
 * Values are written out explicitly and equal the implicit numbering.
 */
typedef enum {
    QUOTE_MINIMAL    = 0,
    QUOTE_ALL        = 1,
    QUOTE_NONNUMERIC = 2,
    QUOTE_NONE       = 3
} QuoteStyle;
/*
 * io_callback: data-source read hook stored in parser_t.cb_io.
 *   src        - opaque source handle
 *   nbytes     - byte count passed in by the caller
 *   bytes_read - out-parameter of type size_t
 *   status     - out-parameter of type int
 * Returns a void pointer.
 * NOTE(review): presumably it returns a buffer holding *bytes_read bytes
 * (at most nbytes) and sets *status to 0, REACHED_EOF or
 * CALLING_READ_FAILED -- confirm against the callback implementations.
 */
typedef void* (*io_callback)(void *src, size_t nbytes, size_t *bytes_read,
int *status);
/*
 * io_cleanup: data-source teardown hook stored in parser_t.cb_cleanup;
 * receives the opaque source handle and returns an int status.
 */
typedef int (*io_cleanup)(void *src);
/*
 * parser_t: the full state of one tokenizer instance -- the data source and
 * its callbacks, the raw chunk currently being processed, the tokenized
 * output buffers, per-line bookkeeping, the dialect/options, and slots for
 * warning and error messages.
 */
typedef struct parser_t {
/* --- data source --- */
// NOTE(review): presumably the handle passed as `src` to cb_io/cb_cleanup -- confirm
void *source;
io_callback cb_io;
io_cleanup cb_cleanup;
int chunksize; // Number of bytes to prepare for each chunk
char *data; // pointer to data to be processed
int datalen; // amount of data available
// NOTE(review): presumably the current read offset into `data` -- confirm
int datapos;
/* --- tokenized output --- */
// where to write out tokenized data
char *stream;
int stream_len;
int stream_cap;
// Words are stored in a (potentially ragged) matrix for now
char **words;
int *word_starts; // where we are in the stream
int words_len;
int words_cap;
char *pword_start; // pointer to stream start of current field
int word_start; // position start of current field
/* --- per-line bookkeeping --- */
int *line_start; // position in words for start of line
int *line_fields; // Number of fields in each line
int lines; // Number of (good) lines observed
int file_lines; // Number of file lines observed (including bad or skipped)
int lines_cap; // Vector capacity
/* --- tokenizer state and dialect options --- */
ParserState state;
int doublequote; /* is " represented by ""? */
char delimiter; /* field separator */
int delim_whitespace; /* delimit by consuming space/tabs instead */
char quotechar; /* quote character */
char escapechar; /* escape character */
char lineterminator;
int skipinitialspace; /* ignore spaces following delimiter? */
int quoting; /* style of quoting to write */
// NOTE(review): the original author marked the following field as cruft;
// its purpose is not visible from this header.
int numeric_field;
char commentchar;
int allow_embedded_newline;
int strict; /* raise exception on bad CSV */
int usecols; // Boolean: 1: usecols provided, 0: none provided
int expected_fields;
int error_bad_lines;
int warn_bad_lines;
// floating point options
char decimal;
char sci;
// thousands separator (comma, period)
char thousands;
/* --- header and row skipping --- */
int header; // Boolean: 1: has header, 0: no header
int header_start; // header row start
int header_end; // header row end
// NOTE(review): opaque; presumably the set of rows registered through
// parser_add_skiprow() -- confirm
void *skipset;
int64_t skip_first_N_rows;
int skip_footer;
// string-to-double routine; its signature matches xstrtod(),
// precise_xstrtod() and round_trip() declared later in this header
double (*converter)(const char *, char **, char, char, char, int);
// error handling
char *warn_msg;
char *error_msg;
int skip_empty_lines;
} parser_t;
/*
 * coliter_t: cursor used by COLITER_NEXT to read one column's field from
 * successive rows.
 */
typedef struct coliter_t {
char **words; // table of field strings, indexed by word position
int *line_start; // points at the current row's entry in a line-start array; advanced by COLITER_NEXT
int col; // column offset added to the row's start position
} coliter_t;
/*
 * Column-iterator construction; the definitions live outside this header.
 * NOTE(review): presumably `i` is the column index and `start` the first row
 * to iterate from, and coliter_new() returns an iterator the caller must
 * release -- confirm in tokenizer.c.
 */
void coliter_setup(coliter_t *self, parser_t *parser, int i, int start);
coliter_t *coliter_new(parser_t *self, int i);
/*
 * COLITER_NEXT(iter, word): read the current row's field for the iterator's
 * column into `word`, then advance the iterator to the next row.
 *
 *   iter - a coliter_t lvalue (the struct itself, not a pointer to it);
 *          its line_start member is incremented by one element.
 *   word - lvalue that receives the field string. It is set to "" when the
 *          computed word position is not below the start position of the
 *          following row, i.e. the row has no field for this column.
 *
 * Both arguments are parenthesized in the expansion so that arbitrary lvalue
 * expressions (e.g. `*it_ptr`) work. The temporary has a macro-specific name
 * so it cannot capture a caller variable that appears inside the `word`
 * argument -- a plain `i` here would silently redirect
 * `COLITER_NEXT(it, out[i])` to the wrong element.
 */
#define COLITER_NEXT(iter, word) do { \
    const int coliter_next_idx_ = *(iter).line_start++ + (iter).col; \
    (word) = coliter_next_idx_ < *(iter).line_start \
        ? (iter).words[coliter_next_idx_] : ""; \
} while (0)
/*
 * Parser API; all definitions live outside this header.
 * NOTE(review): the return-value conventions of the int-returning functions
 * are not visible here (presumably 0 on success and a negative value such as
 * PARSER_OUT_OF_MEMORY on failure) -- confirm in tokenizer.c.
 */
/* Construction and initialisation. */
parser_t* parser_new(void);
int parser_init(parser_t *self);
/* Buffer management on an existing parser. */
int parser_consume_rows(parser_t *self, size_t nrows);
int parser_trim_buffers(parser_t *self);
/* Row-skipping configuration. */
int parser_add_skiprow(parser_t *self, int64_t row);
int parser_set_skipfirstnrows(parser_t *self, int64_t nrows);
/* Teardown, defaults and debugging. */
void parser_free(parser_t *self);
void parser_set_default_options(parser_t *self);
void debug_print_parser(parser_t *self);
/* Tokenizing entry points: a bounded number of rows, or everything. */
int tokenize_nrows(parser_t *self, size_t nrows);
int tokenize_all_rows(parser_t *self);
/*
Have parsed / type-converted a chunk of data and want to free memory from the
token stream
*/
//int clear_parsed_lines(parser_t *self, size_t nlines);
/*
 * String-to-value conversion helpers; definitions live outside this header.
 */
/*
 * Convert the text at p_item to int64_t. `error` is an int out-parameter and
 * `tsep` a single character.
 * NOTE(review): presumably int_min/int_max bound the accepted range, *error
 * receives one of the ERROR_OK-family codes, and tsep is the thousands
 * separator -- confirm in tokenizer.c.
 */
int64_t str_to_int64(const char *p_item, int64_t int_min,
int64_t int_max, int *error, char tsep);
//uint64_t str_to_uint64(const char *p_item, uint64_t uint_max, int *error);
/*
 * Three string-to-double routines with one shared signature, which is also
 * the type of parser_t.converter, so any of them can be stored there.
 * NOTE(review): presumably `q` receives the end-of-parse pointer in the
 * manner of strtod() -- confirm in tokenizer.c.
 */
double xstrtod(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing);
double precise_xstrtod(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing);
double round_trip(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing);
//int P_INLINE to_complex(char *item, double *p_real, double *p_imag, char sci, char decimal);
//int P_INLINE to_longlong(char *item, long long *p_value);
//int P_INLINE to_longlong_thousands(char *item, long long *p_value, char tsep);
/* Parse `item` as a boolean; the result is written through `val`, the int return is a status. */
int to_boolean(const char *item, uint8_t *val);
#endif // _PARSER_COMMON_H_