#!/usr/bin/env python3
|
|
#
|
|
# Dump info about linker script symbols that pertain to addresses and sizes.
|
|
#
|
|
# Author: Kareem Khazem <karkhaz@amazon.com>
|
|
# Copyright Amazon, Inc. 2017
|
|
|
|
|
|
import argparse
|
|
import json
|
|
import logging
|
|
from logging import error, warning, info, debug
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
import textwrap
|
|
import traceback
|
|
|
|
|
|
def epilog():
    """Return the long-form help text appended to the --help output."""
    text = """
    This script generates a C file containing two kinds of information:

    - The values of symbols that are defined in a linker script; these
      are printed as C definitions, like
          char *bss_start = (char *)4070047185u;

    - The extent of ELF sections that are defined in a linker script;
      these are printed as CPROVER annotations, like
          __CPROVER_allocated_memory(0xe9fda44b, 4096);

    A goto-binary of this C file can be linked into the rest of the
    codebase that you wish to analyse. This provides CPROVER with
    definitions that it otherwise would not have access to, since they
    are defined in a linker script rather than C code. This information
    can also be printed in JSON rather than as a C file, so that CPROVER
    can invoke and use this script without user intervention.

    This script needs a list of symbols declared but never defined in C
    code. The hacky way of supplying this list is by passing the path to
    the codebase with the --dir flag; this script will scan the codebase
    for extern-declared variables. A better way is to generate that list
    with CPROVER, and pass that list in using --sym-file. The argument
    to --sym-file can be a filename, or '-' for stdin.
    """
    return textwrap.dedent(text)
|
|
|
|
|
|
"""`Running-regex' linker script parser. We don't currently use a full
|
|
grammar, as we only need a fraction of the information contained in
|
|
linker scripts to give to CBMC. If in the future we need a more
|
|
sophisticated parser, we should use an actual grammar from a real
|
|
parser. GNU LD uses a YACC/Flex setup and has a very complete grammar,
|
|
but we cannot use it (GPL 3). LLD (the LLVM project's linker script
|
|
parser) is hand-written (so no explicit grammar), but they do not aim to
|
|
support all of GNU LD's syntax, so LLD doesn't work on some real linker
|
|
scripts. So in summary: use this regex parser while it's practical;
|
|
switch to LLD when needed, and possibly contribute to LLD development to
|
|
support parsing your use case."""
|
|
def get_linker_script_data(script):
    """Parse a linker script and return the accumulated parse state.

    Reads `script`, flattens it to one whitespace-normalised string,
    then walks that string, trying a set of regexes at each position
    and dispatching to a handler function whenever one matches. The
    handlers update the `state` dict, whose keys are documented inline
    below; that dict is returned.
    """
    try:
        with open(script) as f:
            lines = f.read().splitlines()
    except IOError:
        error("Linker script '%s' not found", script)
        exit(1)

    text = " ".join(lines)
    text = re.sub(r"\s+", " ", text)

    # In these regexes, we always start by matching a whitespace. This
    # is so that we don't match every substring of an identifier. i.e.
    # if we have a section definition .text : { ..., then we only want
    # to recognise a section called ".text", and not also "text", "ext",
    # "xt", and "t".
    #
    # Just to be safe, ensure that the first character of the linker
    # script is a whitespace.
    text = " %s" % text

    # Lex out comments
    text = re.sub(r"/\*.*?\*/", " ", text)

    close_brace = re.compile(r"\s}(\s*>\s*\w+)?")
    unknown_cmd = re.compile(r"\sPHDRS\s*{")  # only this pattern for now, more might follow!
    memory_cmd = re.compile(r"\sMEMORY\s*{")
    sections_cmd = re.compile(r"\sSECTIONS\s*{")
    assign_current = re.compile(r"\s(?P<sym>\w+)\s*=\s*\.\s*;")
    sec_def = re.compile(r"\s(?P<sec>([-\.\w]+)|(/DISCARD/))"
                         r"\s+([^:{};]*?):([^:{};])*?{")
    assign_size = re.compile(r"\s(?P<sym>\w+)\s*=\s*SIZEOF\("
                             r"(?P<sec>\.\w+)\)\s*;")
    memory_block = re.compile(r"\s(?P<name>\w+)\s*:\s*ORIGIN\s*=\s*"
                              r"(?P<orig>0x[a-fA-F0-9]+)\s*,\s*"
                              r"LENGTH\s*=\s*(?P<len>\d+)\s*"
                              r"(?P<unit>[KMG])")
    exp = r"(ORIGIN\(\w+\)|LENGTH\(\w+\))"
    op = r"(\+|\-)"
    assign_expr = re.compile(r"\s(?P<sym>\w+)\s*=\s*"
                             r"(?P<expr>{exp}(\s*{op}\s*{exp})*)"
                             r"\s*;".format(exp=exp, op=op))

    # If we match a regex, call the right function to update the state
    # with the info gleaned from the matched string.
    jump_table = {
        close_brace: close_brace_fun,
        unknown_cmd: unknown_cmd_fun,
        memory_cmd: memory_cmd_fun,
        sections_cmd: sections_cmd_fun,
        assign_current: assign_current_fun,
        memory_block: memory_block_fun,
        sec_def: sec_def_fun,
        assign_size: assign_size_fun,
        assign_expr: assign_expr_fun,
    }

    # Whenever we match an interesting regex, we'll update the state
    # with whatever information we want to rip from that bit of text.
    state = {}

    # The section definition that we were last in.
    state["cur-sec"] = None

    # If we know what section *start* the current address (.) points to,
    # then this will not be None. It's used to match an assignment to
    # the start of a section.
    state["start-valid"] = None

    # If we have just seen an assignment, then this will not be None.
    # It's used to match up an assignment with the end of a section.
    state["end-valid"] = None

    # Each entry here maps a section name to a map. That map maps "size"
    # to the symbol pointing to the size of the section, and "start"
    # to the symbol pointing to the start address of the section. One of
    # "start" or "size" may be absent, if we couldn't get that bit of
    # information from the linker script.
    state["sections"] = {}

    # We can use the list of valid addresses to sanity-check that the
    # start addresses of sections are genuinely addresses.
    state["valid-addresses"] = []

    # Symbols that get some expression assigned to them, either inside
    # or outside a section definition. We'll match them up later.
    state["expr-assigns"] = []

    # These are to sanity-check the parsing.
    state["MEM"] = False
    state["SEC"] = False
    state["DEF"] = False
    state["UNKNOWN"] = False

    i = 0
    while i < len(text):
        buf = text[i:]
        i += 1
        asrt(not (state["MEM"] and state["SEC"]),
             "memory & sections", buf)
        asrt(not state["DEF"] or state["SEC"],
             "def outside SECTION", buf)

        jump_fun = None
        matched_str = None
        matched_re = None
        match = None
        for regex, fun in jump_table.items():
            m = regex.match(buf)
            if m:
                if jump_fun is not None:
                    error("matched multiple regexes\n%s", buf)
                    exit(1)
                jump_fun = fun
                match = m
                matched_str = buf[m.span()[0]:m.span()[1]]
                # Recover the *name* of the matched pattern (for logging
                # only) by searching the local namespace for the object.
                for s, v in locals().items():
                    # Compare the variable name with equality, not
                    # identity: `s is not "regex"` relied on string
                    # interning and warns on modern Python.
                    if v is regex and s != "regex":
                        matched_re = s
        if jump_fun is not None:
            info("regex '%s' matched '%s'", matched_re, matched_str)
            jump_fun(state, match, buf)
            i = i + match.span()[1] - 1
        else:
            debug("Clobbering due to '%s'...", buf[:20])
            # There may have been some intermediate command between the
            # start of a section definition and where we are. So we have
            # no idea what address the current address pointer refers to
            state["start-valid"] = None
            # There may have been an intermediate command between the
            # last assignment and the end of the section.
            state["end-valid"] = None
    match_up_expr_assigns(state)
    return state
|
|
|
|
|
|
def assign_expr_fun(state, match, _):
    """Handle `sym = ORIGIN(x) [+- LENGTH(x) ...];` assignments.

    Substitutes the ORIGIN/LENGTH of known memory blocks into the
    expression, evaluates it, and appends {"origin", "sym", "addr"} to
    state["expr-assigns"]. Skips (with a log message) expressions that
    do not mention exactly one ORIGIN, or that fail to evaluate.
    """
    # Do NOT invalidate 'start-valid' here. Assignments from expressions
    # do not actually advance the current address pointer.
    sym, expr = match.group("sym"), match.group("expr")
    origin_pat = r"ORIGIN\((?P<block>\w+?)\)"
    origins = re.findall(origin_pat, expr)
    if len(origins) != 1:
        # Match.string is an attribute (the subject string), not a
        # method; calling it raised a TypeError here.
        info("assign with %d origins, skipping: %s", len(origins),
             match.string)
        return
    ret = {"origin": origins[0], "sym": sym}
    # A script without a MEMORY command never creates the "blocks" key.
    for block_name, data in state.get("blocks", {}).items():
        for op in ["ORIGIN", "LENGTH"]:
            old_expr = str(expr)
            expr = re.sub(r"%s\(%s\)" % (op, block_name), str(data[op]),
                          expr)
            if expr != old_expr:
                info("Subbed %s(%s) with %d", op, block_name, data[op])
    info("Final expression is '%s'. Evaluating; "
         "may the angels have mercy on my soul.", expr)
    try:
        # The expression comes from the user's own linker script, so
        # eval here is deliberately trusted input.
        ret["addr"] = eval(expr)
    except Exception:
        # eval raises SyntaxError/NameError/etc. (never RuntimeError,
        # which the original caught). Bail out: without "addr" the log
        # line below and match_up_expr_assigns would crash.
        warning("Unable to evaluate '%s'", expr)
        return
    info("Evaluated '%s' to %d", expr, ret["addr"])
    state["expr-assigns"].append(ret)
|
|
|
|
|
|
def sec_def_fun(state, match, buf):
    """Enter a section definition such as `.text : { ...`."""
    asrt(not state["DEF"], "nested sec def", buf)
    state["DEF"] = True
    section = match.group("sec")
    info("Current section is now '%s'", section)
    state["cur-sec"] = section
    # The current address pointer (.) now points at this section's start.
    state["start-valid"] = True
|
|
|
|
|
|
def assign_size_fun(state, match, buf):
    """Record `sym = SIZEOF(.sec);` — sym holds the size of .sec."""
    asrt(state["SEC"], "assignment outside SECTIONS", buf)
    section = match.group("sec")
    symbol = match.group("sym")
    info("'%s' marks the size of section '%s'", symbol, section)
    # Create the per-section map on first sight of the section.
    state["sections"].setdefault(section, {})["size"] = symbol
|
|
|
|
|
|
def assign_current_fun(state, match, buf):
    """Record `sym = .;` — sym is assigned the current address."""
    asrt(state["SEC"], "assignment outside SECTIONS", buf)
    # Remember this assignment: if a closing brace follows immediately,
    # the same symbol also marks the end of the current section.
    state["end-valid"] = match
    if not state["start-valid"]:
        info("Don't know where we are.")
        return
    section = state["cur-sec"]
    symbol = match.group("sym")
    info("'%s' marks the start of section '%s'", symbol, section)
    state["sections"].setdefault(section, {})["start"] = symbol
|
|
|
|
|
|
def close_brace_fun(state, _, buf):
    """Handle a closing brace, ending whichever construct is open."""
    # An assignment seen immediately before this brace names the end of
    # the current section.
    if state["end-valid"]:
        asrt(state["DEF"], "end-valid outside sec-def", buf)
        section = state["cur-sec"]
        if section in state["sections"]:
            symbol = state["end-valid"].group("sym")
            info("'%s' marks the end of section '%s'", symbol, section)
            state["sections"][section]["end"] = symbol
            state["end-valid"] = None
        # Otherwise the script assigned the end of the section to a
        # symbol without marking its start; that is useless to us.

    # Close the innermost open construct, most specific first.
    for flag, message in (("DEF", "Closing sec-def"),
                          ("SEC", "Closing sections"),
                          ("MEM", "Closing memory command"),
                          ("UNKNOWN", "Closing unknown command")):
        if state[flag]:
            info(message)
            state[flag] = False
            break
    else:
        error("Not in block\n%s", buf)
        traceback.print_stack()
        exit(1)
|
|
|
|
|
|
def memory_block_fun(state, m, buf):
    """Handle `name : ORIGIN = 0x..., LENGTH = nU` inside MEMORY.

    Records the block's address range in state["valid-addresses"], and
    its ORIGIN/LENGTH under state["blocks"][name] so that later
    ORIGIN(name)/LENGTH(name) expressions can be resolved.
    """
    asrt(state["MEM"], "memory block outside MEMORY", buf)
    start, length, unit = m.group("orig"), m.group("len"), m.group("unit")
    length = int(length)
    dec_start = int(start, 16)
    # Linker-script K/M/G suffixes are binary multipliers: the GNU LD
    # manual specifies K = 1024 and M = 1024*1024, not powers of 1000.
    mul = {"K": 1024, "M": 1024 ** 2, "G": 1024 ** 3}
    length = length * mul[unit]
    end = dec_start + length
    info("mem block %s from %d to %d (%d%s long)", start, dec_start, end,
         length, unit)
    state["valid-addresses"].append({"from": dec_start, "to": end})

    name = m.group("name")
    if "blocks" not in state:
        state["blocks"] = {}
    state["blocks"][name] = {"ORIGIN": int(start, 16), "LENGTH": length}
|
|
|
|
|
|
def sections_cmd_fun(state, _, buf):
    """Enter the SECTIONS command; a script may contain only one."""
    asrt(not state["SEC"], "encountered SECTIONS twice", buf)
    state["SEC"] = True
|
|
|
|
|
|
def memory_cmd_fun(state, _, buf):
    """Enter the MEMORY command; a script may contain only one."""
    asrt(not state["MEM"], "encountered MEMORY twice", buf)
    state["MEM"] = True
|
|
|
|
def unknown_cmd_fun(state, _, buf):
    """Enter a command we recognise but do not interpret (e.g. PHDRS).

    Sets the UNKNOWN flag so the matching close brace can clear it.
    """
    # Guard against re-entering an unknown command: the relevant flag
    # is UNKNOWN (the code previously tested MEM by mistake, which the
    # error message here always intended to describe).
    asrt(not state["UNKNOWN"], "encountered UNKNOWN twice", buf)
    state["UNKNOWN"] = True
|
|
|
|
def match_up_expr_assigns(state):
    """Pair up expression-assigned symbols that share a memory block.

    For each block mentioned by at least two assignments, treat the
    lowest- and highest-addressed symbols as the start and end of a
    pseudo-section spanning that block, and record it in
    state["sections"].
    """
    block_names = {a["origin"] for a in state["expr-assigns"]}
    for block in block_names:
        by_addr = sorted(
            (a for a in state["expr-assigns"] if a["origin"] == block),
            key=lambda a: a["addr"])
        if len(by_addr) < 2:
            warning("Only 1 assignment to expr involving %s", block)
            continue
        lo, hi = by_addr[0], by_addr[-1]
        info("Valid memory from %s (%d) to %s (%s) [%s block]",
             lo["sym"], lo["addr"], hi["sym"], hi["addr"], block)
        pseudo_name = "BLOCK_%s_%s-%s" % (block, lo["sym"], hi["sym"])
        state["sections"][pseudo_name] = {"start": lo["sym"],
                                          "end": hi["sym"]}
|
|
|
|
|
|
def asrt(cond, msg, buf):
    """Abort with `msg` (plus the offending buffer) unless `cond` holds."""
    if cond:
        return
    error("%s\n%s", msg, buf)
    exit(1)
|
|
|
|
|
|
def needed_definitions(all_symbols, root_dir):
    """Scan a codebase for `extern char NAME[];` declarations.

    Walks `root_dir` looking at .c/.cpp/.h files and collects the
    declared names. Exits with an error if any collected name is
    missing from `all_symbols` (the object file's symbol table);
    otherwise returns the list of names.
    """
    extern_pat = re.compile(r"extern\s+char\s+(?P<var>\w+)\[\];")
    wanted_exts = {".c", ".cpp", ".h"}
    found = []
    for root, _, names in os.walk(root_dir):
        for name in names:
            path = os.path.join(root, name)
            if os.path.splitext(path)[1] not in wanted_exts:
                continue
            with open(path) as handle:
                for line in handle:
                    m = extern_pat.match(line)
                    if m:
                        found.append(m.group("var"))
    missing = [v for v in found if v not in all_symbols]
    if missing:
        logging.error("These symbols need definitions but are not "
                      "in the object file: %s", ", ".join(missing))
        exit(1)
    logging.info("need symbols:\n%s", "\n".join(found))
    return found
|
|
|
|
|
|
def symbols_from(object_file):
    """Return {symbol-name: address-string} parsed from `objdump --syms`.

    Runs objdump on `object_file`, parses the SYMBOL TABLE portion of
    its output, and exits on any failure (objdump error or a line that
    the symbol-table regex cannot parse).
    """
    cmd = ["objdump", "--syms", object_file]
    proc = subprocess.Popen(cmd, universal_newlines=True,
                            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    # Use communicate() rather than wait()-then-read(): waiting first
    # can deadlock if objdump fills the OS pipe buffer (see the warning
    # in the subprocess documentation on Popen.wait with stdout=PIPE).
    out, _ = proc.communicate()
    if proc.returncode:
        logging.error("`%s` failed. Output:\n%s", " ".join(cmd), out)
        exit(1)
    pat = re.compile(r"(?P<addr>[^\s]+)\s+"
                     r"(?P<flags>[lgu! ][w ][C ][W ][Ii ][Dd ][FfO ])\s+"
                     r"(?P<section>[^\s]+)\s+"
                     r"(?P<size>[0-9a-f]+)\s+"
                     r"(?P<name>[^\s]*)"  # Can be empty!
                     )
    matching = False
    ret = {}
    for line in out.splitlines():
        if not line:
            continue
        # Skip objdump's preamble until the symbol table header.
        if not matching and re.match("SYMBOL TABLE:", line):
            matching = True
            continue
        if not matching:
            continue
        m = pat.match(line)
        if not m:
            logging.error("Unexpected line from `%s`:\n%s",
                          " ".join(cmd), line)
            exit(1)
        ret[m.group("name")] = m.group("addr")
    logging.info("found symbols:\n%s", "\n".join(
        ["0x%-16s %s" % (v, k) for k, v in ret.items()]))
    return ret
|
|
|
|
|
|
def match_up_addresses(script_data, symbol_table):
    """Join section info from the script with the object's symbol table.

    Returns a list of "region" dicts, each mapping "start" and "size"
    and/or "end" to {"sym", "val"} pairs plus the section name. A
    region is kept only when it has a start together with either a
    size or an end.
    """
    def usable(mapping):
        # A start address alone cannot describe an extent.
        return "start" in mapping and ("size" in mapping or "end" in mapping)

    regions = []
    for name, data in script_data["sections"].items():
        if not usable(data):
            continue
        region = {"section": name}
        for key in ("size", "start", "end"):
            if key not in data:
                continue
            for sym, value in symbol_table.items():
                if sym == data[key]:
                    region[key] = {"sym": sym, "val": value}
        # The symbol table may lack some of the symbols; re-check.
        if usable(region):
            regions.append(region)
    return regions
|
|
|
|
|
|
def get_region_range(region):
|
|
ret = {}
|
|
if "end" in region:
|
|
start = int(region["start"]["val"], 16)
|
|
end = int(region["end"]["val"], 16)
|
|
size = end - start
|
|
s_var = region["start"]["sym"]
|
|
e_var = region["end"]["sym"]
|
|
ret["start"] = start
|
|
ret["size"] = size
|
|
ret["start-symbol"] = s_var
|
|
ret["end-symbol"] = e_var
|
|
ret["has-end-symbol"] = True
|
|
ret["annot"] = "__CPROVER_allocated_memory(%s, %d);" % (hex(start), size)
|
|
ret["commt"] = "from %s to %s" % (s_var, e_var)
|
|
elif "size" in region:
|
|
start = int(region["start"]["val"], 16)
|
|
size = int(region["size"]["val"], 16)
|
|
s_var = region["start"]["sym"]
|
|
z_var = region["size"]["sym"]
|
|
ret["start"] = start
|
|
ret["size"] = size
|
|
ret["start-symbol"] = s_var
|
|
ret["size-symbol"] = z_var
|
|
ret["has-end-symbol"] = False
|
|
ret["annot"] = "__CPROVER_allocated_memory(%s, %d);" % (hex(start), size)
|
|
ret["commt"] = "from %s for %s bytes" % (s_var, z_var)
|
|
else:
|
|
raise "Malformatted region\n%s" % str(region)
|
|
ret["section"] = region["section"]
|
|
return ret
|
|
|
|
|
|
def final_json_output(regions, symbol_table):
    """Assemble the JSON-serialisable result handed to CPROVER.

    Returns {"regions": [...], "addresses": [...]}, where each address
    entry maps a symbol name to its numeric value and each region
    entry carries the extent info computed by get_region_range.
    """
    return {
        "regions": [get_region_range(region) for region in regions],
        "addresses": [{"sym": name, "val": int(addr, 16)}
                      for name, addr in symbol_table.items()],
    }
|
|
|
|
|
|
|
|
def symbols_from_file(sym_file):
    """Read linker symbol names, one per line, stripped of whitespace.

    An argument of "-" means read from standard input instead of a
    file.
    """
    if sym_file == "-":
        return [line.strip() for line in sys.stdin.readlines()]
    with open(sym_file) as handle:
        return [line.strip() for line in handle.readlines()]
|
|
|
|
|
|
def main():
    """Entry point: parse arguments, gather linker data, emit JSON."""
    pars = argparse.ArgumentParser(
        description="Generate info about linker-defined symbols and regions.",
        epilog=epilog(),
        formatter_class=argparse.RawDescriptionHelpFormatter)
    pars.add_argument("-s", "--script", metavar="S", required=True,
                      help="path to linker script")
    pars.add_argument("-o", "--object", metavar="O", required=True,
                      help="path to fully-linked binary")

    # Exactly one source for the needed-symbols list.
    sym_source = pars.add_mutually_exclusive_group(required=True)
    sym_source.add_argument("-d", "--dir", metavar="D",
                            help="path to top-level of codebase")
    sym_source.add_argument("-i", "--sym-file",
                            metavar="F", help="file of names of linker symbols")

    pars.add_argument("-t", "--out-file", metavar="F",
                      help="default: stdout", default=None)

    verbs = pars.add_mutually_exclusive_group()
    verbs.add_argument("-v", "--verbose", action="store_true")
    verbs.add_argument("-w", "--very-verbose", action="store_true")

    args = pars.parse_args()

    # Verbosity: -v => INFO, -w => DEBUG, default WARNING.
    if args.verbose:
        level = logging.INFO
    elif args.very_verbose:
        level = logging.DEBUG
    else:
        level = logging.WARNING
    logging.basicConfig(
        format="linkerscript parse %(levelname)s: %(message)s", level=level)

    script_data = get_linker_script_data(args.script)
    symbol_table = symbols_from(args.object)
    needed = (needed_definitions(symbol_table.keys(), args.dir)
              if args.dir else symbols_from_file(args.sym_file))
    # Keep only symbols that actually need linker-provided definitions.
    symbol_table = {k: v for k, v in symbol_table.items() if k in needed}

    regions = match_up_addresses(script_data, symbol_table)

    info("symbol table %s" % json.dumps(symbol_table, indent=2))
    info("script data %s" % json.dumps(script_data, indent=2))
    info("regions %s" % json.dumps(regions, indent=2))

    final = json.dumps(final_json_output(regions, symbol_table),
                       indent=2)
    if args.out_file:
        with open(args.out_file, "w") as handle:
            handle.write(final)
        info(final)
    else:
        print(final)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|