Merge pull request diffblue#1832 from diffblue/smt2-backend

tautschnig · web-flow · commit ab1b267ff782 · 2018-02-21T11:07:07.000Z
SMT2 tokenizer
diff --git a/src/solvers/Makefile b/src/solvers/Makefile
@@ -203,6 +203,7 @@ SRC = $(BOOLEFORCE_SRC) \
       smt2/smt2_conv.cpp \
       smt2/smt2_dec.cpp \
       smt2/smt2_parser.cpp \
+      smt2/smt2_tokenizer.cpp \
       smt2/smt2irep.cpp \
       # Empty last line
 
diff --git a/src/solvers/smt2/smt2_tokenizer.cpp b/src/solvers/smt2/smt2_tokenizer.cpp
@@ -0,0 +1,286 @@
+/*******************************************************************\
+
+Module:
+
+Author: Daniel Kroening, kroening@kroening.com
+
+\*******************************************************************/
+
+#include "smt2_tokenizer.h"
+
+#include <istream>
+
+bool smt2_tokenizert::is_simple_symbol_character(char ch)
+{
+  // any non-empty sequence of letters, digits and the characters
+  // ~ ! @ $ % ^ & * _ - + = < > . ? /
+  // that does not start with a digit and is not a reserved word.
+
+  return isalnum(ch) ||
+     ch=='~' || ch=='!' || ch=='@' || ch=='$' || ch=='%' ||
+     ch=='^' || ch=='&' || ch=='*' || ch=='_' || ch=='-' ||
+     ch=='+' || ch=='=' || ch=='<' || ch=='>' || ch=='.' ||
+     ch=='?' || ch=='/';
+}
+
+smt2_tokenizert::tokent smt2_tokenizert::get_simple_symbol()
+{
+  // any non-empty sequence of letters, digits and the characters
+  // ~ ! @ $ % ^ & * _ - + = < > . ? /
+  // that does not start with a digit and is not a reserved word.
+
+  buffer.clear();
+
+  char ch;
+  while(in->get(ch))
+  {
+    if(is_simple_symbol_character(ch))
+    {
+      buffer+=ch;
+    }
+    else
+    {
+      in->unget(); // put back
+      return SYMBOL;
+    }
+  }
+
+  // eof -- this is ok here
+  if(buffer.empty())
+    return END_OF_FILE;
+  else
+    return SYMBOL;
+}
+
+smt2_tokenizert::tokent smt2_tokenizert::get_decimal_numeral()
+{
+  // we accept any sequence of digits and dots
+
+  buffer.clear();
+
+  char ch;
+  while(in->get(ch))
+  {
+    if(isdigit(ch) || ch=='.')
+    {
+      buffer+=ch;
+    }
+    else
+    {
+      in->unget(); // put back
+      return NUMERAL;
+    }
+  }
+
+  // eof -- this is ok here
+  if(buffer.empty())
+    return END_OF_FILE;
+  else
+    return NUMERAL;
+}
+
+smt2_tokenizert::tokent smt2_tokenizert::get_bin_numeral()
+{
+  // we accept any sequence of '0' or '1'
+
+  buffer.clear();
+  buffer+='#';
+  buffer+='b';
+
+  char ch;
+  while(in->get(ch))
+  {
+    if(ch=='0' || ch=='1')
+    {
+      buffer+=ch;
+    }
+    else
+    {
+      in->unget(); // put back
+      return NUMERAL;
+    }
+  }
+
+  // eof -- this is ok here
+  if(buffer.empty())
+    return END_OF_FILE;
+  else
+    return NUMERAL;
+}
+
+smt2_tokenizert::tokent smt2_tokenizert::get_hex_numeral()
+{
+  // we accept any sequence of '0'-'9', 'a'-'f', 'A'-'F'
+
+  buffer.clear();
+  buffer+='#';
+  buffer+='x';
+
+  char ch;
+  while(in->get(ch))
+  {
+    if(isxdigit(ch))
+    {
+      buffer+=ch;
+    }
+    else
+    {
+      in->unget(); // put back
+      return NUMERAL;
+    }
+  }
+
+  // eof -- this is ok here
+  if(buffer.empty())
+    return END_OF_FILE;
+  else
+    return NUMERAL;
+}
+
+smt2_tokenizert::tokent smt2_tokenizert::get_quoted_symbol()
+{
+  // any sequence of printable ASCII characters (including space,
+  // tab, and line-breaking characters) except for the backslash
+  // character \, that starts and ends with | and does not otherwise
+  // contain |
+
+  buffer.clear();
+
+  char ch;
+  while(in->get(ch))
+  {
+    if(ch=='|')
+      return SYMBOL; // done
+
+    buffer+=ch;
+
+    if(ch=='\n')
+      line_no++;
+  }
+
+  // Hmpf. Eof before end of quoted symbol. This is an error.
+  error() << "EOF within quoted symbol" << eom;
+  return ERROR;
+}
+
+smt2_tokenizert::tokent smt2_tokenizert::get_string_literal()
+{
+  buffer.clear();
+
+  char ch;
+  while(in->get(ch))
+  {
+    if(ch=='"')
+    {
+      // quotes may be escaped by repeating
+      if(in->get(ch))
+      {
+        if(ch=='"')
+        {
+        }
+        else
+        {
+          in->unget();
+          return STRING_LITERAL; // done
+        }
+      }
+      else
+        return STRING_LITERAL; // done
+    }
+    buffer+=ch;
+  }
+
+  // Hmpf. Eof before end of string literal. This is an error.
+  error() << "EOF within string literal" << eom;
+  return ERROR;
+}
+
+smt2_tokenizert::tokent smt2_tokenizert::next_token()
+{
+  if(peeked)
+    return peeked=false, token;
+
+  char ch;
+
+  while(in->get(ch))
+  {
+    switch(ch)
+    {
+    case '\n':
+      line_no++;
+      break;
+
+    case ' ':
+    case '\r':
+    case '\t':
+    case static_cast<char>(160): // non-breaking space
+      // skip any whitespace
+      break;
+
+    case ';': // comment
+      // skip until newline
+      while(in->get(ch) && ch!='\n')
+      {
+        // ignore
+      }
+      break;
+
+    case '(':
+      // produce sub-expression
+      return token=OPEN;
+
+    case ')':
+      // done with sub-expression
+      return token=CLOSE;
+
+    case '|': // quoted symbol
+      return token=get_quoted_symbol();
+
+    case '"': // string literal
+      return token=get_string_literal();
+
+    case ':': // keyword
+      return token=get_simple_symbol();
+
+    case '#':
+      if(in->get(ch))
+      {
+        if(ch=='b')
+          return token=get_bin_numeral();
+        else if(ch=='x')
+          return token=get_hex_numeral();
+        else
+        {
+          error() << "unknown numeral token" << eom;
+          return token=ERROR;
+        }
+      }
+      else
+      {
+        error() << "unexpected EOF in numeral token" << eom;
+        return token=ERROR;
+      }
+      break;
+
+    default: // likely a simple symbol or a numeral
+      if(isdigit(ch))
+      {
+        in->unget();
+        return token=get_decimal_numeral();
+      }
+      else if(is_simple_symbol_character(ch))
+      {
+        in->unget();
+        return token=get_simple_symbol();
+      }
+      else
+      {
+        // illegal character, error
+        error() << "unexpected character `" << ch << '\'' << eom;
+        return token=ERROR;
+      }
+    }
+  }
+
+  return token=END_OF_FILE;
+}
diff --git a/src/solvers/smt2/smt2_tokenizer.h b/src/solvers/smt2/smt2_tokenizer.h
@@ -0,0 +1,69 @@
+/*******************************************************************\
+
+Module:
+
+Author: Daniel Kroening, kroening@kroening.com
+
+\*******************************************************************/
+
+#ifndef CPROVER_SOLVERS_SMT2_SMT2_TOKENIZER_H
+#define CPROVER_SOLVERS_SMT2_SMT2_TOKENIZER_H
+
+#include <util/parser.h>
+
+#include <string>
+
+class smt2_tokenizert:public parsert
+{
+public:
+  explicit smt2_tokenizert(std::istream &_in):
+    ok(true), peeked(false), token(NONE)
+  {
+    in=&_in;
+    line_no=1;
+  }
+
+  operator bool()
+  {
+    return ok;
+  }
+
+protected:
+  std::string buffer;
+  bool ok, peeked;
+  using tokent=enum { NONE, END_OF_FILE, ERROR, STRING_LITERAL,
+                      NUMERAL, SYMBOL, OPEN, CLOSE };
+  tokent token;
+
+  tokent next_token();
+
+  tokent peek()
+  {
+    if(peeked)
+      return token;
+    else
+    {
+      next_token();
+      peeked=true;
+      return token;
+    }
+  }
+
+  mstreamt &error()
+  {
+    ok=false;
+    messaget::error().source_location.set_line(line_no);
+    return messaget::error();
+  }
+
+private:
+  tokent get_decimal_numeral();
+  tokent get_hex_numeral();
+  tokent get_bin_numeral();
+  tokent get_simple_symbol();
+  tokent get_quoted_symbol();
+  tokent get_string_literal();
+  static bool is_simple_symbol_character(char);
+};
+
+#endif // CPROVER_SOLVERS_SMT2_SMT2_PARSER_H
diff --git a/src/solvers/smt2/smt2irep.cpp b/src/solvers/smt2/smt2irep.cpp