diff --git a/regression/ansi-c/linker_script_start+end/main.c b/regression/ansi-c/linker_script_start+end/main.c new file mode 100644 index 00000000000..3a83adc35f2 --- /dev/null +++ b/regression/ansi-c/linker_script_start+end/main.c @@ -0,0 +1,12 @@ +extern char src_start[]; +extern char src_end[]; +extern char dst_start[]; + +void *memcpy(void *dest, void *src, unsigned n){ + return (void *)0; +} + +int main(){ + memcpy(dst_start, src_start, (unsigned)src_end - (unsigned)src_start); + return 0; +} diff --git a/regression/ansi-c/linker_script_start+end/script.ld b/regression/ansi-c/linker_script_start+end/script.ld new file mode 100644 index 00000000000..909b7bbd5cd --- /dev/null +++ b/regression/ansi-c/linker_script_start+end/script.ld @@ -0,0 +1,22 @@ +MEMORY { + RAM : ORIGIN = 0x0, LENGTH = 25M +} + +SECTIONS +{ + /* GCC insists on having these */ + .note.gnu.build-id : { } > RAM + .text : { } > RAM + + .src_section : { + src_start = .; + *(.text*) + src_end = .; + } > RAM + + .dst_section : { + dst_start = .; + *(.text*) + dst_end = .; + } > RAM +} diff --git a/regression/ansi-c/linker_script_start+end/test.desc b/regression/ansi-c/linker_script_start+end/test.desc new file mode 100644 index 00000000000..eb50eb8a5c4 --- /dev/null +++ b/regression/ansi-c/linker_script_start+end/test.desc @@ -0,0 +1,17 @@ +CORE +main.c +-o out.gb -T script.ld -nostdlib +^EXIT=0$ +^SIGNAL=0$ +-- +^warning: ignoring +^CONVERSION ERROR$ +-- +Tesing the functionality of goto-cc's linker script parsing +functionality, ensuring that it can get the values of symbols that are +defined in linker scripts. + +This test ensures that goto-cc and ls-parse can: + +- get the value of a symbol whose value indicates the start of a section; +- get the value of a symbol whose value indicates the end of a section. diff --git a/regression/ansi-c/linker_script_start+size/main.c b/regression/ansi-c/linker_script_start+size/main.c new file mode 100644 index 00000000000..1dcd7e9df42 --- /dev/null +++ b/regression/ansi-c/linker_script_start+size/main.c @@ -0,0 +1,12 @@ +extern char src_start[]; +extern char src_size[]; +extern char dst_start[]; + +void *memcpy(void *dest, void *src, unsigned n){ + return (void *)0; +} + +int main(){ + memcpy(dst_start, src_start, (unsigned)src_size); + return 0; +} diff --git a/regression/ansi-c/linker_script_start+size/script.ld b/regression/ansi-c/linker_script_start+size/script.ld new file mode 100644 index 00000000000..57d0610a5f2 --- /dev/null +++ b/regression/ansi-c/linker_script_start+size/script.ld @@ -0,0 +1,25 @@ + +MEMORY { + RAM : ORIGIN = 0x0, LENGTH = 25M +} + +SECTIONS +{ + /* GCC insists on having these */ + .note.gnu.build-id : { } > RAM + .text : { } > RAM + + .src_section : { + src_start = .; + *(.text*) + src_end = .; + } > RAM + + src_size = src_end - src_start; + + .dst_section : { + dst_start = .; + *(.text*) + dst_end = .; + } > RAM +} diff --git a/regression/ansi-c/linker_script_start+size/test.desc b/regression/ansi-c/linker_script_start+size/test.desc new file mode 100644 index 00000000000..34b07f7bc8b --- /dev/null +++ b/regression/ansi-c/linker_script_start+size/test.desc @@ -0,0 +1,19 @@ +CORE +main.c +-o out.gb -T script.ld -nostdlib +^EXIT=0$ +^SIGNAL=0$ +-- +^warning: ignoring +^CONVERSION ERROR$ +-- +Tesing the functionality of goto-cc's linker script parsing +functionality, ensuring that it can get the values of symbols that are +defined in linker scripts. + +This test ensures that goto-cc and ls-parse can: + +- get the value of a symbol whose value indicates the start of a section; +- get the value of a symbol whose value indicates the size of a section, + and whose value has been generated through a basic arithmetic + expression in the linker script. diff --git a/regression/ansi-c/linker_script_symbol-only/main.c b/regression/ansi-c/linker_script_symbol-only/main.c new file mode 100644 index 00000000000..62fc32c179c --- /dev/null +++ b/regression/ansi-c/linker_script_symbol-only/main.c @@ -0,0 +1,6 @@ +extern char sym[]; + +int main(){ + int foo = (int)sym; + return 0; +} diff --git a/regression/ansi-c/linker_script_symbol-only/script.ld b/regression/ansi-c/linker_script_symbol-only/script.ld new file mode 100644 index 00000000000..168494d9c13 --- /dev/null +++ b/regression/ansi-c/linker_script_symbol-only/script.ld @@ -0,0 +1,15 @@ +MEMORY { + RAM : ORIGIN = 0x0, LENGTH = 25M +} + +SECTIONS +{ + /* GCC insists on having these */ + .note.gnu.build-id : { } > RAM + .text : { } > RAM + + .src_section : { + sym = .; + *(.text*) + } > RAM +} diff --git a/regression/ansi-c/linker_script_symbol-only/test.desc b/regression/ansi-c/linker_script_symbol-only/test.desc new file mode 100644 index 00000000000..f5d818e7cf6 --- /dev/null +++ b/regression/ansi-c/linker_script_symbol-only/test.desc @@ -0,0 +1,15 @@ +CORE +main.c +-o out.gb -T script.ld -nostdlib +^EXIT=0$ +^SIGNAL=0$ +-- +^warning: ignoring +^CONVERSION ERROR$ +-- +Tesing the functionality of goto-cc's linker script parsing +functionality, ensuring that it can get the values of symbols that are +defined in linker scripts. + +This test ensures that goto-cc and ls-parse can get the value of a +symbol whose value indicates the start of a section. diff --git a/scripts/ls_parse.py b/scripts/ls_parse.py index 556b55fa50d..5cbaee68319 100755 --- a/scripts/ls_parse.py +++ b/scripts/ls_parse.py @@ -77,7 +77,10 @@ def get_linker_script_data(script): # script is a whitespace. text = " %s" % text - close_brace = re.compile(r"\s}") + # Lex out comments + text = re.sub(r"/\*.*?\*/", " ", text) + + close_brace = re.compile(r"\s}(\s*>\s*\w+)?") memory_cmd = re.compile(r"\sMEMORY\s*{") sections_cmd = re.compile(r"\sSECTIONS\s*{") assign_current = re.compile(r"\s(?P\w+)\s*=\s*\.\s*;") @@ -256,6 +259,7 @@ def close_brace_fun(state, _, buf): sym = state["end-valid"].group("sym") info("'%s' marks the end of section '%s'", sym, sec) state["sections"][sec]["end"] = sym + state["end-valid"] = None else: # Linker script assigned end-of-section to a symbol, but not # the start. this is useless to us. @@ -388,7 +392,7 @@ def symbols_from(object_file): def match_up_addresses(script_data, symbol_table): ret = [] - for data in script_data["sections"].values(): + for name, data in script_data["sections"].items(): ok = False if "size" in data and "start" in data: ok = True @@ -404,6 +408,7 @@ def match_up_addresses(script_data, symbol_table): region["start"] = {"sym": sym, "val": value} if "end" in data and sym == data["end"]: region["end"] = {"sym": sym, "val": value} + region["section"] = name append = False if "size" in region and "start" in region: append = True @@ -424,6 +429,9 @@ def get_region_range(region): e_var = region["end"]["sym"] ret["start"] = start ret["size"] = size + ret["start-symbol"] = s_var + ret["end-symbol"] = e_var + ret["has-end-symbol"] = True ret["annot"] = "__CPROVER_allocated_memory(%s, %d);" % (hex(start), size) ret["commt"] = "from %s to %s" % (s_var, e_var) elif "size" in region: @@ -433,10 +441,14 @@ def get_region_range(region): z_var = region["size"]["sym"] ret["start"] = start ret["size"] = size + ret["start-symbol"] = s_var + ret["size-symbol"] = z_var + ret["has-end-symbol"] = False ret["annot"] = "__CPROVER_allocated_memory(%s, %d);" % (hex(start), size) ret["commt"] = "from %s for %s bytes" % (s_var, z_var) else: raise "Malformatted region\n%s" % str(region) + ret["section"] = region["section"] return ret @@ -502,9 +514,9 @@ def main(): regions = match_up_addresses(script_data, symbol_table) - info(json.dumps(symbol_table, indent=2)) - info(json.dumps(script_data, indent=2)) - info(json.dumps(regions, indent=2)) + info("symbol table %s" % json.dumps(symbol_table, indent=2)) + info("script data %s" % json.dumps(script_data, indent=2)) + info("regions %s" % json.dumps(regions, indent=2)) final = json.dumps(final_json_output(regions, symbol_table), indent=2) diff --git a/src/goto-cc/Makefile b/src/goto-cc/Makefile index 576a5e39505..374ed507b44 100644 --- a/src/goto-cc/Makefile +++ b/src/goto-cc/Makefile @@ -13,6 +13,7 @@ SRC = armcc_cmdline.cpp \ goto_cc_main.cpp \ goto_cc_mode.cpp \ ld_cmdline.cpp \ + linker_script_merge.cpp \ ms_cl_cmdline.cpp \ ms_cl_mode.cpp \ # Empty last line diff --git a/src/goto-cc/gcc_mode.cpp b/src/goto-cc/gcc_mode.cpp index 231baff6eb5..136b29d3489 100644 --- a/src/goto-cc/gcc_mode.cpp +++ b/src/goto-cc/gcc_mode.cpp @@ -19,26 +19,37 @@ Author: CM Wintersteiger, 2006 #include #endif +#include #include #include #include +#include #include #include #include #include +#include + +#include +#include +#include #include #include #include +#include #include #include #include #include #include +#include + +#include #include -#include "compile.h" +#include "linker_script_merge.h" static std::string compiler_name( const cmdlinet &cmdline, @@ -98,6 +109,7 @@ gcc_modet::gcc_modet( produce_hybrid_binary(_produce_hybrid_binary), act_as_ld(base_name=="ld" || base_name.find("goto-ld")!=std::string::npos), + goto_binary_tmp_suffix(".goto-cc-saved"), // Keys are architectures specified in configt::set_arch(). // Values are lists of GCC architectures that can be supplied as @@ -837,7 +849,7 @@ int gcc_modet::run_gcc(const compilet &compiler) return run(new_argv[0], new_argv, cmdline.stdin_file, ""); } -int gcc_modet::gcc_hybrid_binary(const compilet &compiler) +int gcc_modet::gcc_hybrid_binary(compilet &compiler) { { bool have_files=false; @@ -891,17 +903,20 @@ int gcc_modet::gcc_hybrid_binary(const compilet &compiler) << " to generate hybrid binary" << eom; // save the goto-cc output files + std::list goto_binaries; for(std::list::const_iterator it=output_files.begin(); it!=output_files.end(); it++) { - int result=rename(it->c_str(), (*it+".goto-cc-saved").c_str()); + std::string bin_name=*it+goto_binary_tmp_suffix; + int result=rename(it->c_str(), bin_name.c_str()); if(result!=0) { error() << "Rename failed: " << std::strerror(errno) << eom; return result; } + goto_binaries.push_back(bin_name); } std::string objcopy_cmd; @@ -919,6 +934,18 @@ int gcc_modet::gcc_hybrid_binary(const compilet &compiler) int result=run_gcc(compiler); + if(result==0) + { + linker_script_merget ls_merge( + compiler, output_files, goto_binaries, cmdline, gcc_message_handler); + const int fail=ls_merge.add_linker_script_definitions(); + if(fail!=0) + { + error() << "Unable to merge linker script symbols" << eom; + return fail; + } + } + // merge output from gcc with goto-binaries // using objcopy, or do cleanup if an earlier call failed for(std::list::const_iterator @@ -927,7 +954,7 @@ int gcc_modet::gcc_hybrid_binary(const compilet &compiler) it++) { debug() << "merging " << *it << eom; - std::string saved=*it+".goto-cc-saved"; + std::string saved=*it+goto_binary_tmp_suffix; #ifdef __linux__ if(result==0 && !cmdline.isset('c')) diff --git a/src/goto-cc/gcc_mode.h b/src/goto-cc/gcc_mode.h index 497546ff2a0..8d77ed29af1 100644 --- a/src/goto-cc/gcc_mode.h +++ b/src/goto-cc/gcc_mode.h @@ -16,6 +16,7 @@ Date: June 2006 #include +#include "compile.h" #include "goto_cc_mode.h" class compilet; @@ -39,6 +40,9 @@ class gcc_modet:public goto_cc_modet const bool act_as_ld; std::string native_tool_name; + const std::string goto_binary_tmp_suffix; + + /// \brief Associate CBMC architectures with processor names const std::map> arch_map; int preprocess( @@ -50,7 +54,7 @@ class gcc_modet:public goto_cc_modet /// \brief call gcc with original command line int run_gcc(const compilet &compiler); - int gcc_hybrid_binary(const compilet &compiler); + int gcc_hybrid_binary(compilet &compiler); int asm_output( bool act_as_bcc, diff --git a/src/goto-cc/linker_script_merge.cpp b/src/goto-cc/linker_script_merge.cpp new file mode 100644 index 00000000000..f7ee9e0b057 --- /dev/null +++ b/src/goto-cc/linker_script_merge.cpp @@ -0,0 +1,750 @@ +/*******************************************************************\ + +Module: Linker Script Merging + +Author: Kareem Khazem , 2017 + +\*******************************************************************/ + +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include "compile.h" +#include "linker_script_merge.h" + +int linker_script_merget::add_linker_script_definitions() +{ + if(!cmdline.isset('T') || elf_binaries.size()!=1) + return 0; + const std::string &elf_file=*elf_binaries.begin(); + const std::string &goto_file=*goto_binaries.begin(); + + jsont data; + std::list linker_defined_symbols; + int fail=get_linker_script_data( + data, linker_defined_symbols, compiler.symbol_table, elf_file); + if(fail!=0) + return fail; + + fail=linker_data_is_malformed(data); + if(fail!=0) + { + error() << "Malformed linker-script JSON document" << eom; + data.output(error()); + return fail; + } + + symbol_tablet original_st; + goto_functionst original_gf; + fail=read_goto_binary(goto_file, original_st, original_gf, + get_message_handler()); + if(fail!=0) + { + error() << "Unable to read goto binary for linker script merging" << eom; + return fail; + } + + fail=1; + linker_valuest linker_values; + const auto &pair=original_gf.function_map.find(CPROVER_PREFIX "initialize"); + if(pair==original_gf.function_map.end()) + { + error() << "No " << CPROVER_PREFIX "initialize found in goto_functions" + << eom; + return fail; + } + fail=ls_data2instructions( + data, + cmdline.get_value('T'), + pair->second.body, + original_st, + linker_values); + if(fail!=0) + { + error() << "Could not add linkerscript defs to __CPROVER_initialize" << eom; + return fail; + } + + fail=goto_and_object_mismatch(linker_defined_symbols, linker_values); + if(fail!=0) + return fail; + + // The keys of linker_values are exactly the elements of + // linker_defined_symbols, so iterate over linker_values from now on. + + fail=pointerize_linker_defined_symbols(original_gf, original_st, + linker_values); + if(fail!=0) + { + error() << "Could not pointerize all linker-defined expressions" << eom; + return fail; + } + + fail=compiler.write_object_file(goto_file, original_st, original_gf); + if(fail!=0) + error() << "Could not write linkerscript-augmented binary" << eom; + return fail; +} + +linker_script_merget::linker_script_merget( + compilet &compiler, + std::list &elf_binaries, + std::list &goto_binaries, + cmdlinet &cmdline, + message_handlert &message_handler) : + messaget(message_handler), compiler(compiler), + elf_binaries(elf_binaries), goto_binaries(goto_binaries), + cmdline(cmdline), + replacement_predicates( + { + replacement_predicatet("address of array's first member", + [](const exprt expr){ return to_symbol_expr(expr.op0().op0()); }, + [](const exprt expr) + { + return expr.id()==ID_address_of && + expr.type().id()==ID_pointer && + + expr.op0().id()==ID_index && + expr.op0().type().id()==ID_unsignedbv && + + expr.op0().op0().id()==ID_symbol && + expr.op0().op0().type().id()==ID_array && + + expr.op0().op1().id()==ID_constant && + expr.op0().op1().type().id()==ID_signedbv; + }), + replacement_predicatet("address of array", + [](const exprt expr){ return to_symbol_expr(expr.op0()); }, + [](const exprt expr) + { + return expr.id()==ID_address_of && + expr.type().id()==ID_pointer && + + expr.op0().id()==ID_symbol && + expr.op0().type().id()==ID_array; + }), + replacement_predicatet("array variable", + [](const exprt expr){ return to_symbol_expr(expr); }, + [](const exprt expr) + { + return expr.id()==ID_symbol && + expr.type().id()==ID_array; + }), + replacement_predicatet("pointer (does not need pointerizing)", + [](const exprt expr){ return to_symbol_expr(expr); }, + [](const exprt expr) + { + return expr.id()==ID_symbol && + expr.type().id()==ID_pointer; + }) + }) +{} + +int linker_script_merget::pointerize_linker_defined_symbols( + goto_functionst &goto_functions, + symbol_tablet &symbol_table, + const linker_valuest &linker_values) +{ + int ret=0; + // First, pointerize the actual linker-defined symbols + for(const auto &pair : linker_values) + { + const auto &entry=symbol_table.symbols.find(pair.first); + if(entry==symbol_table.symbols.end()) + continue; + entry->second.type=pointer_type(char_type()); + entry->second.is_extern=false; + entry->second.value=pair.second.second; + } + + // Next, find all occurrences of linker-defined symbols that are _values_ + // of some symbol in the symbol table, and pointerize them too + for(auto &pair : symbol_table.symbols) + { + std::list to_pointerize; + symbols_to_pointerize(linker_values, pair.second.value, to_pointerize); + + if(to_pointerize.empty()) + continue; + debug() << "Pointerizing the symbol-table value of symbol " << pair.first + << eom; + int fail=pointerize_subexprs_of( + pair.second.value, to_pointerize, linker_values); + if(to_pointerize.empty() && fail==0) + continue; + ret=1; + for(const auto &sym : to_pointerize) + error() << " Could not pointerize '" << sym.get_identifier() + << "' in symbol table entry " << pair.first << ". Pretty:\n" + << sym.pretty() << "\n"; + error() << eom; + } + + // Finally, pointerize all occurrences of linker-defined symbols in the + // goto program + for(auto &gf : goto_functions.function_map) + { + goto_programt &program=gf.second.body; + Forall_goto_program_instructions(iit, program) + { + for(exprt *insts : std::list({&(iit->code), &(iit->guard)})) + { + std::list to_pointerize; + symbols_to_pointerize(linker_values, *insts, to_pointerize); + if(to_pointerize.empty()) + continue; + debug() << "Pointerizing a program expression..." << eom; + int fail=pointerize_subexprs_of(*insts, to_pointerize, linker_values); + if(to_pointerize.empty() && fail==0) + continue; + ret=1; + for(const auto &sym : to_pointerize) + { + error() << " Could not pointerize '" << sym.get_identifier() + << "' in function " << gf.first << ". Pretty:\n" + << sym.pretty() << "\n"; + error().source_location=iit->source_location; + } + error() << eom; + } + } + } + return ret; +} + +int linker_script_merget::replace_expr( + exprt &old_expr, + const linker_valuest &linker_values, + const symbol_exprt &old_symbol, + const irep_idt &ident, + const std::string &shape) +{ + auto it=linker_values.find(ident); + if(it==linker_values.end()) + { + error() << "Could not find a new expression for linker script-defined " + << "symbol '" << ident << "'" << eom; + return 1; + } + symbol_exprt new_expr=it->second.first; + new_expr.add_source_location()=old_symbol.source_location(); + debug() << "Pointerizing linker-defined symbol '" << ident << "' of shape <" + << shape << ">." << eom; + old_expr=new_expr; + return 0; +} + +int linker_script_merget::pointerize_subexprs_of( + exprt &expr, + std::list &to_pointerize, + const linker_valuest &linker_values) +{ + int fail=0, tmp=0; + for(auto const &pair : linker_values) + for(auto const &pattern : replacement_predicates) + { + if(!pattern.match(expr)) + continue; + const symbol_exprt &inner_symbol=pattern.inner_symbol(expr); + if(pair.first!=inner_symbol.get_identifier()) + continue; + tmp=replace_expr(expr, linker_values, inner_symbol, pair.first, + pattern.description()); + fail=tmp?tmp:fail; + auto result=std::find(to_pointerize.begin(), to_pointerize.end(), + inner_symbol); + if(result==to_pointerize.end()) + { + fail=1; + error() << "Too many removals of '" << inner_symbol.get_identifier() + << "'" << eom; + } + else + to_pointerize.erase(result); + // If we get here, we've just pointerized this expression. That expression + // will be a symbol possibly wrapped in some unary junk, but won't contain + // other symbols as subexpressions that might need to be pointerized. So + // it's safe to bail out here. + return fail; + } + + for(auto &op : expr.operands()) + { + tmp=pointerize_subexprs_of(op, to_pointerize, linker_values); + fail=tmp?tmp:fail; + } + return fail; +} + +void linker_script_merget::symbols_to_pointerize( + const linker_valuest &linker_values, + const exprt &expr, + std::list &to_pointerize) const +{ + for(const auto &op : expr.operands()) + { + if(op.id()!=ID_symbol) + continue; + const symbol_exprt &sym_exp=to_symbol_expr(op); + const auto &pair=linker_values.find(sym_exp.get_identifier()); + if(pair!=linker_values.end()) + to_pointerize.push_back(sym_exp); + } + for(const auto &op : expr.operands()) + symbols_to_pointerize(linker_values, op, to_pointerize); +} + +#if 0 +The current implementation of this function is less precise than the + commented-out version below. To understand the difference between these + implementations, consider the following example: + +Suppose we have a section in the linker script, 100 bytes long, where the +address of the symbol sec_start is the start of the section (value 4096) and the +address of sec_end is the end of that section (value 4196). + +The current implementation synthesizes the goto-version of the following C: + + char __sec_array[100]; + char *sec_start=(&__sec_array[0]); + char *sec_end=((&__sec_array[0])+100); + // Yes, it is 100 not 99. We're pointing to the end of the memory occupied + // by __sec_array, not the last element of __sec_array. + +This is imprecise for the following reason: the actual address of the array and +the pointers shall be some random CBMC-internal address, instead of being 4096 +and 4196. The linker script, on the other hand, would have specified the exact +position of the section, and we even know what the actual values of sec_start +and sec_end are from the object file (these values are in the `addresses` list +of the `data` argument to this function). If the correctness of the code depends +on these actual values, then CBMCs model of the code is too imprecise to verify +this. + +The commented-out version of this function below synthesizes the following: + + char *sec_start=4096; + char *sec_end=4196; + __CPROVER_allocated_memory(4096, 100); + +This code has both the actual addresses of the start and end of the section and +tells CBMC that the intermediate region is valid. However, the allocated_memory +macro does not currently allocate an actual object at the address 4096, so +symbolic execution can fail. In particular, the 'allocated memory' is part of +__CPROVER_memory, which does not have a bounded size; this means that (for +example) calls to memcpy or memset fail, because the first and third arguments +do not have know n size. The commented-out implementation should be reinstated +once this limitation of __CPROVER_allocated_memory has been fixed. + +In either case, no other changes to the rest of the code (outside this function) +should be necessary. The rest of this file converts expressions containing the +linker-defined symbol into pointer types if they were not already, and this is +the right behaviour for both implementations. +#endif +int linker_script_merget::ls_data2instructions( + jsont &data, + const std::string &linker_script, + goto_programt &gp, + symbol_tablet &symbol_table, + linker_valuest &linker_values) +#if 1 +{ + goto_programt::instructionst initialize_instructions=gp.instructions; + std::map truncated_symbols; + for(auto &d : data["regions"].array) + { + bool has_end=d["has-end-symbol"].is_true(); + + std::ostringstream array_name; + array_name << CPROVER_PREFIX << "linkerscript-array_" + << d["start-symbol"].value; + if(has_end) + array_name << "-" << d["end-symbol"].value; + + + // Array symbol_exprt + std::size_t array_size=integer2size_t(string2integer(d["size"].value)); + if(array_size > MAX_FLATTENED_ARRAY_SIZE) + { + warning() << "Object section '" << d["section"].value << "' of size " + << array_size << " is too large to model. Truncating to " + << MAX_FLATTENED_ARRAY_SIZE << " bytes" << eom; + array_size=MAX_FLATTENED_ARRAY_SIZE; + if(!has_end) + truncated_symbols[d["size-symbol"].value]=MAX_FLATTENED_ARRAY_SIZE; + } + + constant_exprt array_size_expr=from_integer(array_size, size_type()); + array_typet array_type(char_type(), array_size_expr); + symbol_exprt array_expr(array_name.str(), array_type); + source_locationt array_loc; + + array_loc.set_file(linker_script); + std::ostringstream array_comment; + array_comment << "Object section '" << d["section"].value << "' of size " + << integer2unsigned(array_size) << " bytes"; + array_loc.set_comment(array_comment.str()); + array_expr.add_source_location()=array_loc; + + // Array start address + index_exprt zero_idx(array_expr, from_integer(0, size_type())); + address_of_exprt array_start(zero_idx); + + // Linker-defined symbol_exprt pointing to start address + symbol_exprt start_sym(d["start-symbol"].value, pointer_type(char_type())); + linker_values[d["start-symbol"].value]=std::make_pair(start_sym, + array_start); + + // Since the value of the pointer will be a random CBMC address, write a + // note about the real address in the object file + auto it=std::find_if(data["addresses"].array.begin(), + data["addresses"].array.end(), + [&d](const jsont &add) + { return add["sym"].value==d["start-symbol"].value; }); + if(it==data["addresses"].array.end()) + { + error() << "Start: Could not find address corresponding to symbol '" + << d["start-symbol"].value << "' (start of section)" << eom; + return 1; + } + source_locationt start_loc; + start_loc.set_file(linker_script); + std::ostringstream start_comment; + start_comment << "Pointer to beginning of object section '" + << d["section"].value << "'. Original address in object file" + << " is " << (*it)["val"].value; + start_loc.set_comment(start_comment.str()); + start_sym.add_source_location()=start_loc; + + // Instruction for start-address pointer in __CPROVER_initialize + code_assignt start_assign(start_sym, array_start); + start_assign.add_source_location()=start_loc; + goto_programt::instructiont start_assign_i; + start_assign_i.make_assignment(start_assign); + start_assign_i.source_location=start_loc; + initialize_instructions.push_front(start_assign_i); + + if(has_end) // Same for pointer to end of array + { + plus_exprt array_end(array_start, array_size_expr); + + symbol_exprt end_sym(d["end-symbol"].value, pointer_type(char_type())); + linker_values[d["end-symbol"].value]=std::make_pair(end_sym, array_end); + + auto it=std::find_if(data["addresses"].array.begin(), + data["addresses"].array.end(), + [&d](const jsont &add) + { return add["sym"].value==d["end-symbol"].value; }); + if(it==data["addresses"].array.end()) + { + error() << "Could not find address corresponding to symbol '" + << d["end-symbol"].value << "' (end of section)" << eom; + return 1; + } + source_locationt end_loc; + end_loc.set_file(linker_script); + std::ostringstream end_comment; + end_comment << "Pointer to end of object section '" + << d["section"].value << "'. Original address in object file" + << " is " << (*it)["val"].value; + end_loc.set_comment(end_comment.str()); + end_sym.add_source_location()=end_loc; + + code_assignt end_assign(end_sym, array_end); + end_assign.add_source_location()=end_loc; + goto_programt::instructiont end_assign_i; + end_assign_i.make_assignment(end_assign); + end_assign_i.source_location=end_loc; + initialize_instructions.push_front(end_assign_i); + } + + // Add the array to the symbol table. We don't add the pointer(s) to the + // symbol table because they were already there, but declared as extern and + // probably with a different type. We change the externness and type in + // pointerize_linker_defined_symbols. + symbolt array_sym; + array_sym.name=array_name.str(); + array_sym.pretty_name=array_name.str(); + array_sym.is_lvalue=array_sym.is_static_lifetime=true; + array_sym.type=array_type; + array_sym.location=array_loc; + symbol_table.add(array_sym); + + // Push the array initialization to the front now, so that it happens before + // the initialization of the symbols that point to it. + namespacet ns(symbol_table); + exprt zi=zero_initializer(array_type, array_loc, ns, *message_handler); + code_assignt array_assign(array_expr, zi); + array_assign.add_source_location()=array_loc; + goto_programt::instructiont array_assign_i; + array_assign_i.make_assignment(array_assign); + array_assign_i.source_location=array_loc; + initialize_instructions.push_front(array_assign_i); + } + + // We've added every linker-defined symbol that marks the start or end of a + // region. But there might be other linker-defined symbols with some random + // address. These will have been declared extern too, so we need to give them + // a value also. Here, we give them the actual value that they have in the + // object file, since we're not assigning any object to them. + for(const auto &d : data["addresses"].array) + { + auto it=linker_values.find(irep_idt(d["sym"].value)); + if(it!=linker_values.end()) + // sym marks the start or end of a region; already dealt with. + continue; + + // Perhaps this is a size symbol for a section whose size we truncated + // earlier. + irep_idt symbol_value; + const auto &pair=truncated_symbols.find(d["sym"].value); + if(pair==truncated_symbols.end()) + symbol_value=d["val"].value; + else + { + debug() << "Truncating the value of symbol " << d["sym"].value << " from " + << d["val"].value << " to " << MAX_FLATTENED_ARRAY_SIZE + << " because it corresponds to the size of a too-large section." + << eom; + symbol_value=std::to_string(MAX_FLATTENED_ARRAY_SIZE); + } + + source_locationt loc; + loc.set_file(linker_script); + loc.set_comment("linker script-defined symbol: char *"+ + d["sym"].value+"="+"(char *)"+id2string(symbol_value)+"u;"); + + symbol_exprt lhs(d["sym"].value, pointer_type(char_type())); + + constant_exprt rhs; + rhs.set_value(integer2binary(string2integer(id2string(symbol_value)), + unsigned_int_type().get_width())); + rhs.type()=unsigned_int_type(); + + exprt rhs_tc(rhs); + rhs_tc.make_typecast(pointer_type(char_type())); + + linker_values[irep_idt(d["sym"].value)]=std::make_pair(lhs, rhs_tc); + + code_assignt assign(lhs, rhs_tc); + assign.add_source_location()=loc; + goto_programt::instructiont assign_i; + assign_i.make_assignment(assign); + assign_i.source_location=loc; + initialize_instructions.push_front(assign_i); + } + return 0; +} +#else +{ + goto_programt::instructionst initialize_instructions=gp.instructions; + for(const auto &d : data["regions"].array) + { + code_function_callt f; + code_typet void_t; + void_t.return_type()=empty_typet(); + f.function()=symbol_exprt(CPROVER_PREFIX "allocated_memory", void_t); + unsigned start=safe_string2unsigned(d["start"].value); + unsigned size=safe_string2unsigned(d["size"].value); + constant_exprt first=from_integer(start, size_type()); + constant_exprt second=from_integer(size, size_type()); + code_function_callt::argumentst args={first, second}; + f.arguments()=args; + + source_locationt loc; + loc.set_file(linker_script); + loc.set_comment("linker script-defined region:\n"+d["commt"].value+":\n"+ + d["annot"].value); + f.add_source_location()=loc; + + goto_programt::instructiont i; + i.make_function_call(f); + initialize_instructions.push_front(i); + } + + if(!symbol_table.has_symbol(CPROVER_PREFIX "allocated_memory")) + { + symbolt sym; + sym.name=CPROVER_PREFIX "allocated_memory"; + sym.pretty_name=CPROVER_PREFIX "allocated_memory"; + sym.is_lvalue=sym.is_static_lifetime=true; + code_typet void_t; + void_t.return_type()=empty_typet(); + sym.type=void_t; + symbol_table.add(sym); + } + + for(const auto &d : data["addresses"].array) + { + source_locationt loc; + loc.set_file(linker_script); + loc.set_comment("linker script-defined symbol: char *"+ + d["sym"].value+"="+"(char *)"+d["val"].value+"u;"); + + symbol_exprt lhs(d["sym"].value, pointer_type(char_type())); + + constant_exprt rhs; + rhs.set_value(integer2binary(string2integer(d["val"].value), + unsigned_int_type().get_width())); + rhs.type()=unsigned_int_type(); + + exprt rhs_tc(rhs); + rhs_tc.make_typecast(pointer_type(char_type())); + + linker_values[irep_idt(d["sym"].value)]=std::make_pair(lhs, rhs_tc); + + code_assignt assign(lhs, rhs_tc); + assign.add_source_location()=loc; + goto_programt::instructiont assign_i; + assign_i.make_assignment(assign); + initialize_instructions.push_front(assign_i); + } + return 0; +} +#endif + +int linker_script_merget::get_linker_script_data( + jsont &linker_data, + std::list &linker_defined_symbols, + const symbol_tablet &symbol_table, + const std::string &out_file) +{ + for(auto const &pair : symbol_table.symbols) + if(pair.second.is_extern && pair.second.value.is_nil() + && pair.second.name!="__CPROVER_memory") + linker_defined_symbols.push_back(pair.second.name); + + std::ostringstream linker_def_str; + std::copy(linker_defined_symbols.begin(), linker_defined_symbols.end(), + std::ostream_iterator(linker_def_str, "\n")); + debug() << "Linker-defined symbols: [" << linker_def_str.str() << "]\n" + << eom; + + temporary_filet linker_def_outfile("goto-cc-linker-info", ".json"); + temporary_filet linker_def_infile("goto-cc-linker-defs", ""); + std::ofstream linker_def_file(linker_def_infile()); + linker_def_file << linker_def_str.str(); + linker_def_file.close(); + + std::vector argv= + { + "ls_parse.py", + "--script", cmdline.get_value('T'), + "--object", out_file, + "--sym-file", linker_def_infile(), + "--out-file", linker_def_outfile() + }; + if(cmdline.isset("verbosity")) + { + unsigned verb=safe_string2unsigned(cmdline.get_value("verbosity")); + if(verb>9) + argv.push_back("--very-verbose"); + else if(verb>4) + argv.push_back("--verbose"); + } + + int rc=run(argv[0], argv, linker_def_infile(), linker_def_outfile()); + if(rc!=0) + { + error() << "Problem parsing linker script" << eom; + return rc; + } + + int fail=parse_json(linker_def_outfile(), get_message_handler(), + linker_data); + if(fail!=0) + error() << "Problem parsing linker script JSON data" << eom; + return fail; +} + +int linker_script_merget::goto_and_object_mismatch( + const std::list &linker_defined_symbols, + const linker_valuest &linker_values) +{ + int fail=0; + for(const auto &sym : linker_defined_symbols) + if(linker_values.find(sym)==linker_values.end()) + { + fail=1; + error() << "Variable '" << sym << "' was declared extern but never given " + << "a value, even in a linker script" << eom; + } + + for(const auto &pair : linker_values) + { + auto it=std::find(linker_defined_symbols.begin(), + linker_defined_symbols.end(), pair.first); + if(it==linker_defined_symbols.end()) + { + fail=1; + error() << "Linker script-defined symbol '" << pair.first << "' was " + << "either defined in the C source code, not declared extern in " + << "the C source code, or does not appear in the C source code" + << eom; + } + } + return fail; +} + +int linker_script_merget::linker_data_is_malformed(const jsont &data) const +{ + return (!(data.is_object() && + data.object.find("regions")!=data.object.end() && + data.object.find("addresses")!=data.object.end() && + data["regions"].is_array() && + data["addresses"].is_array() && + std::all_of(data["addresses"].array.begin(), + data["addresses"].array.end(), + [](jsont j) + { + return j.is_object() && + j.object.find("val")!=j.object.end() && + j.object.find("sym")!=j.object.end() && + j["val"].is_number() && + j["sym"].is_string(); + }) && + std::all_of(data["regions"].array.begin(), + data["regions"].array.end(), + [](jsont j) + { + return j.is_object() && + j.object.find("start")!=j.object.end() && + j.object.find("size")!=j.object.end() && + j.object.find("annot")!=j.object.end() && + j.object.find("commt")!=j.object.end() && + j.object.find("start-symbol")!=j.object.end() && + j.object.find("has-end-symbol")!=j.object.end() && + j.object.find("section")!=j.object.end() && + j["start"].is_number() && + j["size"].is_number() && + j["annot"].is_string() && + j["start-symbol"].is_string() && + j["section"].is_string() && + j["commt"].is_string() && + ( (j["has-end-symbol"].is_true() && + j.object.find("end-symbol")!=j.object.end() && + j["end-symbol"].is_string()) + ||(j["has-end-symbol"].is_false() && + j.object.find("size-symbol")!=j.object.end() && + j.object.find("end-symbol")==j.object.end() && + j["size-symbol"].is_string())); + }))); +} diff --git a/src/goto-cc/linker_script_merge.h b/src/goto-cc/linker_script_merge.h new file mode 100644 index 00000000000..fe45e37d6e6 --- /dev/null +++ b/src/goto-cc/linker_script_merge.h @@ -0,0 +1,215 @@ +/// \file linker_script_merge.h +/// \brief Merge linker script-defined symbols into a goto-program +/// \author Kareem Khazem + +#ifndef CPROVER_GOTO_CC_LINKER_SCRIPT_MERGE_H +#define CPROVER_GOTO_CC_LINKER_SCRIPT_MERGE_H + +#include + +#include +#include + +#include "compile.h" +#include "gcc_cmdline.h" + +/// \brief Patterns of expressions that should be replaced +/// +/// Each instance of this class describes an expression 'shape' that should be +/// replaced with a different expression. The intention is that if some +/// expression matches the pattern described by this replacement_predicatet +/// (i.e. replacement_predicatet::match() returns true), then that expression +/// should be replaced by a new expression. +class replacement_predicatet +{ +public: + replacement_predicatet( + const std::string &description, + const std::function inner_symbol, + const std::function match) + : _description(description), + _inner_symbol(inner_symbol), + _match(match) + {} + + /// \brief a textual description of the expression that we're trying to match + const std::string &description() const + { + return _description; + } + + /// \brief Return the underlying symbol of the matched expression + /// \pre replacement_predicatet::match() returned true + const symbol_exprt &inner_symbol(const exprt &expr) const + { + return _inner_symbol(expr); + } + + /// \brief Predicate: does the given expression match an interesting pattern? + /// + /// If this function returns true, the entire expression should be replaced by + /// a pointer whose underlying symbol is the symbol returned by + /// replacement_predicatet::inner_symbol(). + const bool match(const exprt &expr) const + { + return _match(expr); + }; + +private: + std::string _description; + std::function _inner_symbol; + std::function _match; +}; + +/// \brief Synthesise definitions of symbols that are defined in linker scripts +class linker_script_merget:public messaget +{ +public: + /// \brief Add values of linkerscript-defined symbols to the goto-binary + /// \pre There is a single output file in each of `elf_binaries` and + /// `goto_binaries`, and the codebase is being linked with a custom + /// linker script passed to the compiler with the `-T` option. + /// \post The `__CPROVER_initialize` function contains synthesized definitions + /// for all symbols that are declared in the C codebase and defined in + /// the linker script. + /// \post All uses of these symbols throughout the code base are re-typed to + /// match the type of the synthesized definitions. + /// \post The `__CPROVER_initialize` function contains one + /// `__CPROVER_allocated_memory` annotation for each object file section + /// that is specified in the linker script. + int add_linker_script_definitions(); + + typedef std::map> linker_valuest; + + linker_script_merget( + compilet &compiler, + std::list &elf_binaries, + std::list &goto_binaries, + cmdlinet &cmdline, + message_handlert &message_handler); + +protected: + compilet &compiler; + std::list &elf_binaries; + std::list &goto_binaries; + cmdlinet &cmdline; + + /// \brief The "shapes" of expressions to be replaced by a pointer + /// + /// Whenever this linker_script_merget encounters an expression `expr` in the + /// goto-program---if `rp.match(expr)` returns `true` for some `rp` in this + /// list, and the underlying symbol of `expr` is a linker-defined symbol, then + /// `expr` will be replaced by a pointer whose value is taken from the value + /// in the linker script. + std::list replacement_predicates; + + /// \brief Write linker script definitions to `linker_data`. + int get_linker_script_data( + jsont &linker_data, + std::list &linker_defined_symbols, + const symbol_tablet &symbol_table, + const std::string &out_file); + + /// \brief Write a list of definitions derived from `data` into gp's + /// `instructions` member. + /// \pre `data` is in the format verified by #linker_data_is_malformed. + /// \post For every memory region in `data`, a function call to + /// `__CPROVER_allocated_memory` is prepended to + /// `initialize_instructions`. + /// \post For every symbol in `data`, a declaration and initialization of that + /// symbol is prepended to `initialize_instructions`. + /// \post Every symbol in `data` shall be a key in `linker_values`; the value + /// shall be a constant expression with the actual value of the symbol + /// in the linker script. + int ls_data2instructions( + jsont &data, + const std::string &linker_script, + goto_programt &gp, + symbol_tablet &symbol_table, + linker_valuest &linker_values); + + /// \brief convert the type of linker script-defined symbols to `char*` + /// + /// #ls_data2instructions synthesizes definitions of linker script-defined + /// symbols, and types those definitions as `char*`. This means that if those + /// symbols were declared extern with a different type throughout the target + /// codebase, we need to change all expressions of those symbols to have type + /// `char*` within the goto functions---as well as in the symbol table. + /// + /// The 'canonical' way for linker script-defined symbols to be declared + /// within the codebase is as char[] variables, so we take care of converting + /// those into char*s. However, the frontend occasionally converts expressions + /// like &foo into &foo[0] (where foo is an array), so we have to convert + /// expressions like that even when they don't appear in the original + /// codebase. + /// + /// Note that in general, there is no limitation on what type a linker + /// script-defined symbol should be declared as in the C codebase, because we + /// should only ever be reading its address. So this function is incomplete in + /// that it assumes that linker script-defined symbols have been declared as + /// arrays in the C codebase. If a linker script-defined symbol is declared as + /// some other type, that would likely need some custom logic to be + /// implemented in this function. + /// + /// \post The types of linker-script defined symbols in the symbol table have + /// been converted to `char*`. + /// \post Expressions of the shape `&foo[0]`, `&foo`, and `foo`, where `foo` + /// is a linker-script defined symbol with type array, have been + /// converted to `foo` whose type is `char*`. + int pointerize_linker_defined_symbols( + goto_functionst &goto_functions, + symbol_tablet &symbol_table, + const linker_valuest &linker_values); + + /// \param expr an expr whose subexpressions may need to be pointerized + /// \param to_pointerize The symbols that are contained in the subexpressions + /// that we will pointerize. + /// \param linker_values the names of symbols defined in linker scripts. + /// + /// The subexpressions that we pointerize should be in one-to-one + /// correspondence with the symbols in `to_pointerize`. Every time we + /// pointerize an expression containing a symbol in `to_pointerize`, we remove + /// that symbol from `to_pointerize`. Therefore, when this function returns, + /// `to_pointerize` should be empty. If it is not, then the symbol is + /// contained in a subexpression whose shape is not recognised. + int pointerize_subexprs_of( + exprt &expr, + std::list &to_pointerize, + const linker_valuest &linker_values); + + /// \brief do the actual replacement of an expr with a new pointer expr + int replace_expr( + exprt &old_expr, + const linker_valuest &linker_values, + const symbol_exprt &old_symbol, + const irep_idt &ident, + const std::string &shape); + + /// \brief fill `to_pointerize` with names of linker symbols appearing in expr + void symbols_to_pointerize( + const linker_valuest &linker_values, + const exprt &expr, + std::list &to_pointerize) const; + + /// \brief one-to-one correspondence between extern & linker symbols + /// + /// Check that a string is in `linker_defined_symbols` iff it is a key in the + /// `linker_values` map. The error messages of this function describe what it + /// means for this constraint to be violated. + /// + /// \param linker_defined_symbols the list of symbols that were extern with no + /// value in the goto-program) + /// \param linker_values map from the names of linker-defined symbols from + /// the object file, to synthesized values for those + /// linker symbols. + /// \return `1` if there is some mismatch between the list and map, `0` + /// if everything is OK. + int goto_and_object_mismatch( + const std::list &linker_defined_symbols, + const linker_valuest &linker_values); + + /// \brief Validate output of the `scripts/ls_parse.py` tool + int linker_data_is_malformed(const jsont &data) const; +}; + +#endif // CPROVER_GOTO_CC_LINKER_SCRIPT_MERGE_H diff --git a/src/goto-programs/goto_program_template.h b/src/goto-programs/goto_program_template.h index aa863a5dd0f..846039341b4 100644 --- a/src/goto-programs/goto_program_template.h +++ b/src/goto-programs/goto_program_template.h @@ -239,6 +239,7 @@ class goto_program_templatet void make_dead() { clear(DEAD); } void make_atomic_begin() { clear(ATOMIC_BEGIN); } void make_atomic_end() { clear(ATOMIC_END); } + void make_end_function() { clear(END_FUNCTION); } void make_goto(targett _target) { @@ -252,6 +253,18 @@ class goto_program_templatet guard=g; } + void make_assignment(const codeT &_code) + { + clear(ASSIGN); + code=_code; + } + + void make_decl(const codeT &_code) + { + clear(DECL); + code=_code; + } + void make_function_call(const codeT &_code) { clear(FUNCTION_CALL); diff --git a/src/solvers/flattening/boolbv.cpp b/src/solvers/flattening/boolbv.cpp index 58b09b0d1bb..eb9612f76ea 100644 --- a/src/solvers/flattening/boolbv.cpp +++ b/src/solvers/flattening/boolbv.cpp @@ -15,6 +15,7 @@ Author: Daniel Kroening, kroening@kroening.com #include #include #include +#include #include #include #include @@ -688,7 +689,7 @@ bool boolbvt::is_unbounded_array(const typet &type) const return true; if(unbounded_array==unbounded_arrayt::U_AUTO) - if(s>1000) // magic number! + if(s>MAX_FLATTENED_ARRAY_SIZE) return true; return false; diff --git a/src/util/magic.h b/src/util/magic.h new file mode 100644 index 00000000000..b3525c2f199 --- /dev/null +++ b/src/util/magic.h @@ -0,0 +1,12 @@ +/// \file +/// \brief Magic numbers used throughout the codebase +/// \author Kareem Khazem + +#ifndef CPROVER_UTIL_MAGIC_H +#define CPROVER_UTIL_MAGIC_H + +#include + +const std::size_t MAX_FLATTENED_ARRAY_SIZE=1000; + +#endif