|
import editdistance |
|
import frontmatter |
|
from hexdump2 import hexdump |
|
import gradio as gr |
|
import json |
|
import shlex |
|
import subprocess |
|
import tempfile |
|
|
|
from dist import levenshtein_with_wildcard, print_match_summary |
|
|
|
# The `.content` of README.md as parsed by `frontmatter`; run() passes it to
# gr.Interface as the page description.
description = frontmatter.load("README.md").content
|
|
|
|
|
def trim(str, n):
    """Return *str* with its first *n* lines removed.

    The text is split with ``splitlines()`` and the remaining lines are
    re-joined with a single newline character, so the original line
    terminators and any trailing newline are not preserved.
    """
    # NOTE: the parameter name shadows the builtin ``str``; it is kept because
    # it is part of the function's signature.
    lines = str.splitlines()
    remaining = lines[n:]
    return "\n".join(remaining)
|
|
|
|
|
def trim_objdump(str):
    """Drop the leading header lines from objdump output *str*.

    Delegates to ``trim`` with a fixed count of seven lines.
    """
    header_line_count = 7
    return trim(str, header_line_count)
|
|
|
|
|
def disassemble_bytes(byte_data, architecture, options):
    """Disassemble raw machine code with ``objdump``.

    The bytes are written to a temporary ``.bin`` file which is handed to
    ``objdump -D -b binary -m <architecture> -M <options>``.

    Args:
        byte_data: Raw bytes to disassemble.
        architecture: Value passed to objdump's ``-m`` switch.
        options: Value passed to objdump's ``-M`` switch.

    Returns:
        objdump's standard output with its leading header lines removed by
        ``trim_objdump``. objdump's stderr and exit status are not inspected.
    """
    # Delete-on-close keeps repeated calls from accumulating files in the
    # temp directory. objdump opens the file by name while it is still open
    # here, which compile() in this file already relies on for its .raw file.
    with tempfile.NamedTemporaryFile(suffix=".bin") as temp_bin_file:
        temp_bin_file.write(byte_data)
        # Flush so objdump sees every byte when it opens the file by name.
        temp_bin_file.flush()

        result = subprocess.run(
            [
                "objdump",
                "-D",
                "-b",
                "binary",
                "-m",
                architecture,
                "-M",
                options,
                temp_bin_file.name,
            ],
            capture_output=True,
            text=True,
        )

    return trim_objdump(result.stdout)
|
|
|
|
|
def _extract_text_bytes(object_path):
    """Return the raw bytes objcopy emits for the ``.text`` / ``.text.*`` sections."""
    with tempfile.NamedTemporaryFile(suffix=".raw", delete=True) as raw_bytes_file:
        # objcopy's exit status is not checked; if it writes nothing the
        # result is simply empty bytes.
        subprocess.run(
            [
                "objcopy",
                "--only-section",
                ".text",
                "--only-section",
                ".text.*",
                "-O",
                "binary",
                object_path,
                raw_bytes_file.name,
            ]
        )
        # objcopy wrote to the file by name; read it back through our handle.
        return raw_bytes_file.read()


def _read_relocations(object_path):
    """Return the object's relocation records, skipping those against ``.text``."""
    raw_json = subprocess.run(
        [
            "llvm-readobj-19",
            "--elf-output-style=JSON",
            "--relocations",
            object_path,
        ],
        capture_output=True,
        text=True,
    ).stdout
    reloc_sections = json.loads(raw_json)[0]["Relocations"]
    return [
        entry["Relocation"]
        for section in reloc_sections
        for entry in section["Relocs"]
        if entry["Relocation"]["Symbol"]["Name"] != ".text"
    ]


def compile(compiler, flags, source):
    """Compile *source* to an object file and extract what is needed to compare it.

    Args:
        compiler: Executable to run as the compiler.
        flags: Extra command-line flags as one string; split with
            ``shlex.split``.
        source: Source code text, written to a ``.c`` file before compiling.

    Returns:
        A 4-tuple ``(relocs, compiled_bytes, compile_output, disassembly)``.
        ``compile_output`` is the compiler's stdout followed by its stderr.
        When the compiler exits with a non-zero status the tuple is
        ``(None, None, compile_output, None)``.
    """
    import os

    # NOTE(security): ``compiler`` and ``flags`` are executed exactly as given
    # (as an argument list, without a shell). In this file they come from
    # Gradio textboxes, so anyone who can reach the UI can run arbitrary
    # programs on the host.

    # Everything lives in one scratch directory that is removed on exit, so
    # neither the source nor the object file is left behind -- even when the
    # compiler fails or deletes its own output.
    with tempfile.TemporaryDirectory() as work_dir:
        c_path = os.path.join(work_dir, "source.c")
        o_path = os.path.join(work_dir, "source.o")

        with open(c_path, "wb") as c_file:
            c_file.write(source.encode())

        result = subprocess.run(
            [compiler, "-c", c_path] + shlex.split(flags) + ["-o", o_path],
            capture_output=True,
            text=True,
        )
        compile_output = result.stdout + result.stderr

        if result.returncode != 0:
            return None, None, compile_output, None

        compiled_bytes = _extract_text_bytes(o_path)

        disassembly = subprocess.run(
            ["objdump", "-dr", o_path], capture_output=True, text=True
        ).stdout
        disassembly = trim_objdump(disassembly)

        json_relocs = _read_relocations(o_path)
        return json_relocs, compiled_bytes, compile_output, disassembly
|
|
|
|
|
# Width in bytes of the field patched by each supported x86-64 relocation
# type (field sizes per the System V x86-64 psABI relocation table).
_RELOC_FIELD_SIZES = {
    "R_X86_64_8": 1,
    "R_X86_64_PC8": 1,
    "R_X86_64_16": 2,
    "R_X86_64_PC16": 2,
    "R_X86_64_32": 4,
    "R_X86_64_32S": 4,
    "R_X86_64_PC32": 4,
    "R_X86_64_PLT32": 4,
    "R_X86_64_GOT32": 4,
    "R_X86_64_GOTPCREL": 4,
    "R_X86_64_GOTPCRELX": 4,
    "R_X86_64_REX_GOTPCRELX": 4,
    "R_X86_64_GOTPC32": 4,
    "R_X86_64_TLSGD": 4,
    "R_X86_64_TLSLD": 4,
    "R_X86_64_DTPOFF32": 4,
    "R_X86_64_GOTTPOFF": 4,
    "R_X86_64_TPOFF32": 4,
    "R_X86_64_64": 8,
    "R_X86_64_PC64": 8,
    "R_X86_64_GOTOFF64": 8,
    "R_X86_64_DTPOFF64": 8,
    "R_X86_64_TPOFF64": 8,
}


def _reloc_type2size(s):
    """Return the size in bytes of the field patched by relocation type *s*.

    Args:
        s: Relocation type name, e.g. ``"R_X86_64_PC32"``.

    Returns:
        The field width in bytes.

    Raises:
        AssertionError: If *s* is not a known relocation type. It is raised
            explicitly rather than via ``assert`` so the check is still
            enforced when Python runs with ``-O``.
    """
    size = _RELOC_FIELD_SIZES.get(s) if isinstance(s, str) else None
    if size is None:
        raise AssertionError(f"Unknown reloc {s}")
    return size
|
|
|
|
|
def _compute_relocs_byte_range(json_relocs):
    """Return a flat list of every byte offset covered by the given relocations.

    Each record contributes the offsets from its ``"Offset"`` up to (but not
    including) ``"Offset"`` plus the size reported by ``_reloc_type2size`` for
    its ``["Type"]["Name"]``. Offsets appear in record order.
    """
    covered_offsets = []
    for reloc in json_relocs:
        start = reloc["Offset"]
        width = _reloc_type2size(reloc["Type"]["Name"])
        covered_offsets.extend(range(start, start + width))
    return covered_offsets
|
|
|
|
|
def predict(target_bytes, source, compiler, flags, disasm_arch, disasm_options):
    """Compile *source* and compare the result against the target bytes.

    Args:
        target_bytes: Target function bytes as a hex string (parsed with
            ``bytes.fromhex``).
        source: Source code handed to ``compile``.
        compiler: Compiler executable handed to ``compile``.
        flags: Compiler flags string handed to ``compile``.
        disasm_arch: Architecture handed to ``disassemble_bytes``.
        disasm_options: Disassembler options handed to ``disassemble_bytes``.

    Returns:
        A 9-tuple in the order of the Gradio outputs declared in ``run``:
        compiled-bytes hexdump, target-bytes hexdump, plain edit distance,
        relocation-aware edit distance, newline-joined edit operations,
        compiler output, compiled disassembly, compiled relocations, target
        disassembly. When compilation fails the first element is the string
        ``"Compilation failed"``, the plain distance is ``-1`` and the two
        relocation-aware fields are ``None``.
    """
    target = bytes.fromhex(target_bytes)
    compiled_relocs, compiled_bytes, compile_output, compiled_disassembly = compile(
        compiler, flags, source
    )
    target_disassembly = disassemble_bytes(target, disasm_arch, disasm_options)

    # Guard clause: nothing to compare when the compiler produced no bytes.
    if compiled_bytes is None:
        return (
            "Compilation failed",
            hexdump(target, result="return"),
            -1,
            None,
            None,
            compile_output,
            compiled_disassembly,
            compiled_relocs,
            target_disassembly,
        )

    # Bytes covered by relocations in the compiled code are passed as
    # wildcard offsets for the second sequence.
    wildcard_offsets = _compute_relocs_byte_range(compiled_relocs)
    reloc_edit_distance, reloc_operations = print_match_summary(
        target,
        compiled_bytes,
        wildcard_offsets_seq2=wildcard_offsets,
    )
    print(f"reloc_edit_distance: {reloc_edit_distance}")
    print(f"reloc operations: {reloc_operations}")

    compiled_hexdump = hexdump(compiled_bytes, result="return")
    target_hexdump = hexdump(target, result="return")
    plain_edit_distance = editdistance.eval(compiled_bytes, target)
    operations_text = "\n".join(reloc_operations)

    return (
        compiled_hexdump,
        target_hexdump,
        plain_edit_distance,
        reloc_edit_distance,
        operations_text,
        compile_output,
        compiled_disassembly,
        compiled_relocs,
        target_disassembly,
    )
|
|
|
|
|
def run():
    """Build the Gradio interface around ``predict`` and start the server.

    The interface listens on 0.0.0.0:7860 with error display enabled.
    """
    input_components = [
        gr.Textbox(
            lines=10,
            label="Bytes of Target Function (in hex)",
            value="b8 2a 00 00 00 c3",
        ),
        gr.Textbox(
            lines=10,
            label="Decompiled C Source Code",
            value="int x;\nint foo() { return x; }",
        ),
        gr.Textbox(label="Compiler", value="g++"),
        gr.Textbox(label="Compiler Flags", value="-O2"),
        gr.Textbox(label="Architecture (objdump -m)", value="i386"),
        gr.Textbox(label="Disassembler options (objdump -M)", value="x86-64"),
    ]

    # Order must match the tuple returned by predict().
    output_components = [
        gr.Textbox(label="Compiled bytes"),
        gr.Textbox(label="Target bytes"),
        gr.Number(label="Edit distance (lower is better)"),
        gr.Number(label="Edit distance (ignoring relocs; lower is better)"),
        gr.Textbox(label="Edit description (ignoring relocs)"),
        gr.Textbox(label="Compiler Output"),
        gr.Textbox(label="Compiled Disassembly"),
        gr.JSON(label="Compiled relocations", open=True),
        gr.Textbox(label="Target Disassembly"),
    ]

    demo = gr.Interface(
        fn=predict,
        description=description,
        inputs=input_components,
        outputs=output_components,
    )
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)
|
|
|
|
|
# Build and launch the Gradio app whenever this module is executed or imported.
run()
|
|