Skip to content

SMT2 tokenizer #1832

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Feb 21, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/solvers/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,7 @@ SRC = $(BOOLEFORCE_SRC) \
smt2/smt2_conv.cpp \
smt2/smt2_dec.cpp \
smt2/smt2_parser.cpp \
smt2/smt2_tokenizer.cpp \
smt2/smt2irep.cpp \
# Empty last line

Expand Down
286 changes: 286 additions & 0 deletions src/solvers/smt2/smt2_tokenizer.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,286 @@
/*******************************************************************\

Module:

Author: Daniel Kroening, [email protected]

\*******************************************************************/

#include "smt2_tokenizer.h"

#include <istream>

bool smt2_tokenizert::is_simple_symbol_character(char ch)
{
// any non-empty sequence of letters, digits and the characters
// ~ ! @ $ % ^ & * _ - + = < > . ? /
// that does not start with a digit and is not a reserved word.

return isalnum(ch) ||
ch=='~' || ch=='!' || ch=='@' || ch=='$' || ch=='%' ||
ch=='^' || ch=='&' || ch=='*' || ch=='_' || ch=='-' ||
ch=='+' || ch=='=' || ch=='<' || ch=='>' || ch=='.' ||
ch=='?' || ch=='/';
}

smt2_tokenizert::tokent smt2_tokenizert::get_simple_symbol()
{
// any non-empty sequence of letters, digits and the characters
// ~ ! @ $ % ^ & * _ - + = < > . ? /
// that does not start with a digit and is not a reserved word.

buffer.clear();

char ch;
while(in->get(ch))
{
if(is_simple_symbol_character(ch))
{
buffer+=ch;
}
else
{
in->unget(); // put back
return SYMBOL;
}
}

// eof -- this is ok here
if(buffer.empty())
return END_OF_FILE;
else
return SYMBOL;
}

smt2_tokenizert::tokent smt2_tokenizert::get_decimal_numeral()
{
// we accept any sequence of digits and dots

buffer.clear();

char ch;
while(in->get(ch))
{
if(isdigit(ch) || ch=='.')
{
buffer+=ch;
}
else
{
in->unget(); // put back
return NUMERAL;
}
}

// eof -- this is ok here
if(buffer.empty())
return END_OF_FILE;
else
return NUMERAL;
}

smt2_tokenizert::tokent smt2_tokenizert::get_bin_numeral()
{
// we accept any sequence of '0' or '1'

buffer.clear();
buffer+='#';
buffer+='b';

char ch;
while(in->get(ch))
{
if(ch=='0' || ch=='1')
{
buffer+=ch;
}
else
{
in->unget(); // put back
return NUMERAL;
}
}

// eof -- this is ok here
if(buffer.empty())
return END_OF_FILE;
else
return NUMERAL;
}

smt2_tokenizert::tokent smt2_tokenizert::get_hex_numeral()
{
// we accept any sequence of '0'-'9', 'a'-'f', 'A'-'F'

buffer.clear();
buffer+='#';
buffer+='x';

char ch;
while(in->get(ch))
{
if(isxdigit(ch))
{
buffer+=ch;
}
else
{
in->unget(); // put back
return NUMERAL;
}
}

// eof -- this is ok here
if(buffer.empty())
return END_OF_FILE;
else
return NUMERAL;
}

smt2_tokenizert::tokent smt2_tokenizert::get_quoted_symbol()
{
// any sequence of printable ASCII characters (including space,
// tab, and line-breaking characters) except for the backslash
// character \, that starts and ends with | and does not otherwise
// contain |

buffer.clear();

char ch;
while(in->get(ch))
{
if(ch=='|')
return SYMBOL; // done

buffer+=ch;

if(ch=='\n')
line_no++;
}

// Hmpf. Eof before end of quoted symbol. This is an error.
error() << "EOF within quoted symbol" << eom;
return ERROR;
}

smt2_tokenizert::tokent smt2_tokenizert::get_string_literal()
{
buffer.clear();

char ch;
while(in->get(ch))
{
if(ch=='"')
{
// quotes may be escaped by repeating
if(in->get(ch))
{
if(ch=='"')
{
}
else
{
in->unget();
return STRING_LITERAL; // done
}
}
else
return STRING_LITERAL; // done
}
buffer+=ch;
}

// Hmpf. Eof before end of string literal. This is an error.
error() << "EOF within string literal" << eom;
return ERROR;
}

smt2_tokenizert::tokent smt2_tokenizert::next_token()
{
if(peeked)
return peeked=false, token;

char ch;

while(in->get(ch))
{
switch(ch)
{
case '\n':
line_no++;
break;

case ' ':
case '\r':
case '\t':
case static_cast<char>(160): // non-breaking space
// skip any whitespace
break;

case ';': // comment
// skip until newline
while(in->get(ch) && ch!='\n')
{
// ignore
}
break;

case '(':
// produce sub-expression
return token=OPEN;

case ')':
// done with sub-expression
return token=CLOSE;

case '|': // quoted symbol
return token=get_quoted_symbol();

case '"': // string literal
return token=get_string_literal();

case ':': // keyword
return token=get_simple_symbol();

case '#':
if(in->get(ch))
{
if(ch=='b')
return token=get_bin_numeral();
else if(ch=='x')
return token=get_hex_numeral();
else
{
error() << "unknown numeral token" << eom;
return token=ERROR;
}
}
else
{
error() << "unexpected EOF in numeral token" << eom;
return token=ERROR;
}
break;

default: // likely a simple symbol or a numeral
if(isdigit(ch))
{
in->unget();
return token=get_decimal_numeral();
}
else if(is_simple_symbol_character(ch))
{
in->unget();
return token=get_simple_symbol();
}
else
{
// illegal character, error
error() << "unexpected character `" << ch << '\'' << eom;
return token=ERROR;
}
}
}

return token=END_OF_FILE;
}
69 changes: 69 additions & 0 deletions src/solvers/smt2/smt2_tokenizer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/*******************************************************************\

Module:

Author: Daniel Kroening, [email protected]

\*******************************************************************/

#ifndef CPROVER_SOLVERS_SMT2_SMT2_TOKENIZER_H
#define CPROVER_SOLVERS_SMT2_SMT2_TOKENIZER_H

#include <util/parser.h>

#include <string>

class smt2_tokenizert:public parsert
{
public:
explicit smt2_tokenizert(std::istream &_in):
ok(true), peeked(false), token(NONE)
{
in=&_in;
line_no=1;
}

operator bool()
{
return ok;
}

protected:
std::string buffer;
bool ok, peeked;
using tokent=enum { NONE, END_OF_FILE, ERROR, STRING_LITERAL,
NUMERAL, SYMBOL, OPEN, CLOSE };
tokent token;

tokent next_token();

tokent peek()
{
if(peeked)
return token;
else
{
next_token();
peeked=true;
return token;
}
}

mstreamt &error()
{
ok=false;
messaget::error().source_location.set_line(line_no);
return messaget::error();
}

private:
tokent get_decimal_numeral();
tokent get_hex_numeral();
tokent get_bin_numeral();
tokent get_simple_symbol();
tokent get_quoted_symbol();
tokent get_string_literal();
static bool is_simple_symbol_character(char);
};

#endif // CPROVER_SOLVERS_SMT2_SMT2_PARSER_H
Loading