Skip to content

Commit ab1b267

Browse files
authored
Merge pull request diffblue#1832 from diffblue/smt2-backend
SMT2 tokenizer
2 parents 11f3699 + a0bfd42 commit ab1b267

File tree

4 files changed

+417
-53
lines changed

4 files changed

+417
-53
lines changed

src/solvers/Makefile

+1
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,7 @@ SRC = $(BOOLEFORCE_SRC) \
203203
smt2/smt2_conv.cpp \
204204
smt2/smt2_dec.cpp \
205205
smt2/smt2_parser.cpp \
206+
smt2/smt2_tokenizer.cpp \
206207
smt2/smt2irep.cpp \
207208
# Empty last line
208209

src/solvers/smt2/smt2_tokenizer.cpp

+286
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,286 @@
1+
/*******************************************************************\
2+
3+
Module:
4+
5+
Author: Daniel Kroening, [email protected]
6+
7+
\*******************************************************************/
8+
9+
#include "smt2_tokenizer.h"
10+
11+
#include <istream>
12+
13+
bool smt2_tokenizert::is_simple_symbol_character(char ch)
14+
{
15+
// any non-empty sequence of letters, digits and the characters
16+
// ~ ! @ $ % ^ & * _ - + = < > . ? /
17+
// that does not start with a digit and is not a reserved word.
18+
19+
return isalnum(ch) ||
20+
ch=='~' || ch=='!' || ch=='@' || ch=='$' || ch=='%' ||
21+
ch=='^' || ch=='&' || ch=='*' || ch=='_' || ch=='-' ||
22+
ch=='+' || ch=='=' || ch=='<' || ch=='>' || ch=='.' ||
23+
ch=='?' || ch=='/';
24+
}
25+
26+
smt2_tokenizert::tokent smt2_tokenizert::get_simple_symbol()
27+
{
28+
// any non-empty sequence of letters, digits and the characters
29+
// ~ ! @ $ % ^ & * _ - + = < > . ? /
30+
// that does not start with a digit and is not a reserved word.
31+
32+
buffer.clear();
33+
34+
char ch;
35+
while(in->get(ch))
36+
{
37+
if(is_simple_symbol_character(ch))
38+
{
39+
buffer+=ch;
40+
}
41+
else
42+
{
43+
in->unget(); // put back
44+
return SYMBOL;
45+
}
46+
}
47+
48+
// eof -- this is ok here
49+
if(buffer.empty())
50+
return END_OF_FILE;
51+
else
52+
return SYMBOL;
53+
}
54+
55+
smt2_tokenizert::tokent smt2_tokenizert::get_decimal_numeral()
56+
{
57+
// we accept any sequence of digits and dots
58+
59+
buffer.clear();
60+
61+
char ch;
62+
while(in->get(ch))
63+
{
64+
if(isdigit(ch) || ch=='.')
65+
{
66+
buffer+=ch;
67+
}
68+
else
69+
{
70+
in->unget(); // put back
71+
return NUMERAL;
72+
}
73+
}
74+
75+
// eof -- this is ok here
76+
if(buffer.empty())
77+
return END_OF_FILE;
78+
else
79+
return NUMERAL;
80+
}
81+
82+
smt2_tokenizert::tokent smt2_tokenizert::get_bin_numeral()
83+
{
84+
// we accept any sequence of '0' or '1'
85+
86+
buffer.clear();
87+
buffer+='#';
88+
buffer+='b';
89+
90+
char ch;
91+
while(in->get(ch))
92+
{
93+
if(ch=='0' || ch=='1')
94+
{
95+
buffer+=ch;
96+
}
97+
else
98+
{
99+
in->unget(); // put back
100+
return NUMERAL;
101+
}
102+
}
103+
104+
// eof -- this is ok here
105+
if(buffer.empty())
106+
return END_OF_FILE;
107+
else
108+
return NUMERAL;
109+
}
110+
111+
smt2_tokenizert::tokent smt2_tokenizert::get_hex_numeral()
112+
{
113+
// we accept any sequence of '0'-'9', 'a'-'f', 'A'-'F'
114+
115+
buffer.clear();
116+
buffer+='#';
117+
buffer+='x';
118+
119+
char ch;
120+
while(in->get(ch))
121+
{
122+
if(isxdigit(ch))
123+
{
124+
buffer+=ch;
125+
}
126+
else
127+
{
128+
in->unget(); // put back
129+
return NUMERAL;
130+
}
131+
}
132+
133+
// eof -- this is ok here
134+
if(buffer.empty())
135+
return END_OF_FILE;
136+
else
137+
return NUMERAL;
138+
}
139+
140+
smt2_tokenizert::tokent smt2_tokenizert::get_quoted_symbol()
141+
{
142+
// any sequence of printable ASCII characters (including space,
143+
// tab, and line-breaking characters) except for the backslash
144+
// character \, that starts and ends with | and does not otherwise
145+
// contain |
146+
147+
buffer.clear();
148+
149+
char ch;
150+
while(in->get(ch))
151+
{
152+
if(ch=='|')
153+
return SYMBOL; // done
154+
155+
buffer+=ch;
156+
157+
if(ch=='\n')
158+
line_no++;
159+
}
160+
161+
// Hmpf. Eof before end of quoted symbol. This is an error.
162+
error() << "EOF within quoted symbol" << eom;
163+
return ERROR;
164+
}
165+
166+
smt2_tokenizert::tokent smt2_tokenizert::get_string_literal()
167+
{
168+
buffer.clear();
169+
170+
char ch;
171+
while(in->get(ch))
172+
{
173+
if(ch=='"')
174+
{
175+
// quotes may be escaped by repeating
176+
if(in->get(ch))
177+
{
178+
if(ch=='"')
179+
{
180+
}
181+
else
182+
{
183+
in->unget();
184+
return STRING_LITERAL; // done
185+
}
186+
}
187+
else
188+
return STRING_LITERAL; // done
189+
}
190+
buffer+=ch;
191+
}
192+
193+
// Hmpf. Eof before end of string literal. This is an error.
194+
error() << "EOF within string literal" << eom;
195+
return ERROR;
196+
}
197+
198+
smt2_tokenizert::tokent smt2_tokenizert::next_token()
199+
{
200+
if(peeked)
201+
return peeked=false, token;
202+
203+
char ch;
204+
205+
while(in->get(ch))
206+
{
207+
switch(ch)
208+
{
209+
case '\n':
210+
line_no++;
211+
break;
212+
213+
case ' ':
214+
case '\r':
215+
case '\t':
216+
case static_cast<char>(160): // non-breaking space
217+
// skip any whitespace
218+
break;
219+
220+
case ';': // comment
221+
// skip until newline
222+
while(in->get(ch) && ch!='\n')
223+
{
224+
// ignore
225+
}
226+
break;
227+
228+
case '(':
229+
// produce sub-expression
230+
return token=OPEN;
231+
232+
case ')':
233+
// done with sub-expression
234+
return token=CLOSE;
235+
236+
case '|': // quoted symbol
237+
return token=get_quoted_symbol();
238+
239+
case '"': // string literal
240+
return token=get_string_literal();
241+
242+
case ':': // keyword
243+
return token=get_simple_symbol();
244+
245+
case '#':
246+
if(in->get(ch))
247+
{
248+
if(ch=='b')
249+
return token=get_bin_numeral();
250+
else if(ch=='x')
251+
return token=get_hex_numeral();
252+
else
253+
{
254+
error() << "unknown numeral token" << eom;
255+
return token=ERROR;
256+
}
257+
}
258+
else
259+
{
260+
error() << "unexpected EOF in numeral token" << eom;
261+
return token=ERROR;
262+
}
263+
break;
264+
265+
default: // likely a simple symbol or a numeral
266+
if(isdigit(ch))
267+
{
268+
in->unget();
269+
return token=get_decimal_numeral();
270+
}
271+
else if(is_simple_symbol_character(ch))
272+
{
273+
in->unget();
274+
return token=get_simple_symbol();
275+
}
276+
else
277+
{
278+
// illegal character, error
279+
error() << "unexpected character `" << ch << '\'' << eom;
280+
return token=ERROR;
281+
}
282+
}
283+
}
284+
285+
return token=END_OF_FILE;
286+
}

src/solvers/smt2/smt2_tokenizer.h

+69
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
/*******************************************************************\
2+
3+
Module:
4+
5+
Author: Daniel Kroening, [email protected]
6+
7+
\*******************************************************************/
8+
9+
#ifndef CPROVER_SOLVERS_SMT2_SMT2_TOKENIZER_H
10+
#define CPROVER_SOLVERS_SMT2_SMT2_TOKENIZER_H
11+
12+
#include <util/parser.h>
13+
14+
#include <string>
15+
16+
class smt2_tokenizert:public parsert
17+
{
18+
public:
19+
explicit smt2_tokenizert(std::istream &_in):
20+
ok(true), peeked(false), token(NONE)
21+
{
22+
in=&_in;
23+
line_no=1;
24+
}
25+
26+
operator bool()
27+
{
28+
return ok;
29+
}
30+
31+
protected:
32+
std::string buffer;
33+
bool ok, peeked;
34+
using tokent=enum { NONE, END_OF_FILE, ERROR, STRING_LITERAL,
35+
NUMERAL, SYMBOL, OPEN, CLOSE };
36+
tokent token;
37+
38+
tokent next_token();
39+
40+
tokent peek()
41+
{
42+
if(peeked)
43+
return token;
44+
else
45+
{
46+
next_token();
47+
peeked=true;
48+
return token;
49+
}
50+
}
51+
52+
mstreamt &error()
53+
{
54+
ok=false;
55+
messaget::error().source_location.set_line(line_no);
56+
return messaget::error();
57+
}
58+
59+
private:
60+
tokent get_decimal_numeral();
61+
tokent get_hex_numeral();
62+
tokent get_bin_numeral();
63+
tokent get_simple_symbol();
64+
tokent get_quoted_symbol();
65+
tokent get_string_literal();
66+
static bool is_simple_symbol_character(char);
67+
};
68+
69+
#endif // CPROVER_SOLVERS_SMT2_SMT2_PARSER_H

0 commit comments

Comments
 (0)