LLM4Decompile
LLM4Decompile copied to clipboard
Concern Regarding Dataset Integrity
Upon thorough examination, it has come to my attention that the dataset's integrity might be compromised due to the methodology employed in generating the assembly representations. Specifically, the use of object files instead of fully linked binaries introduces inaccuracies, particularly concerning external function calls and the handling of immediate values.
Because the linking step is skipped, the displacement operand of every external call is left as zero (its relocation is unresolved), which produces a misleading disassembly: each call to an external function is shown as a call to the instruction immediately following it. As a result, all external calls look alike, which can severely impact the model's ability to distinguish between different external functions.
For example, in your `decompile-eval.json` (line 294, task 10, O1), the function that uses `strlen`, `malloc`, and `strncpy` produces the following disassembly as model input; its `callq` instructions do not point to the correct locations. Even state-of-the-art decompilers cannot decompile such assembly when the object files are stripped and the correct targets are not filled into those calls.
# func0 -- disassembly listing (GNU objdump style, AT&T syntax, x86-64) of an
# unlinked object file. The address column was stripped, so branch operands
# such as "ca <func0+0xca>" have no visible anchor; the "offset 0x.." notes
# below mark the instructions they refer to (reconstructed from instruction
# encoding lengths; consistent with every branch and call operand here).
#
# Every callq operand equals the offset of the instruction right after the
# call: the displacement is still zero because no relocation was applied, so
# the real callee cannot be read from this text.
#
# Register roles, as established by the code below:
#   rbp  = first argument, the input string (assumes SysV AMD64 -- TODO confirm)
#   r12  = string length (r12d used as a 32-bit int), r15d = 2 * length
#   r14  = pointer returned by the first call (the output buffer)
#   ebx  = outer index i, rsi = rbp + i while searching
#
# Overall shape: find the smallest i for which the bytes rbp[i .. len-1] read
# the same forwards and backwards, then write the string followed by its
# first i bytes in reverse order, NUL-terminated, into the buffer.
endbr64
# Save callee-saved registers and reserve 0x18 bytes of stack.
push %r15
push %r14
push %r13
push %r12
push %rbp
push %rbx
sub $0x18,%rsp
mov %rdi,%rbp
# Inlined string-length scan: search for a 0 byte with rcx counting down
# from -1; length = ~rcx - 1.
mov $0xffffffffffffffff,%rcx
mov $0x0,%eax
repnz scas %es:(%rdi),%al
mov %rcx,%rax
not %rax
lea -0x1(%rax),%r12
# r15d = 2*len; argument rdi = (long)(2*len + 1).
lea (%r12,%r12,1),%r15d
lea 0x1(%r15),%edi
movslq %edi,%rdi
# Unresolved external call; presumably malloc (the surrounding issue text
# names it; the relocation is not shown here).
callq 3d <func0+0x3d>
# offset 0x3d: keep the returned pointer; if it is NULL, return it as-is.
mov %rax,%r14
test %rax,%rax
je ca <func0+0xca>
# r13d = len; len <= 0 takes the short path at offset 0x7a.
mov %r12d,%r13d
test %r12d,%r12d
jle 7a <func0+0x7a>
# Search setup: r9d = len, r8d = len-1 (also spilled to 0xc(%rsp)),
# rsi = string start, i = 0, rdi = (long)len - 1. Enter the loop at 0xe8.
mov %r12d,%r9d
lea -0x1(%r12),%eax
mov %eax,0xc(%rsp)
mov %eax,%r8d
mov %rbp,%rsi
mov $0x0,%ebx
movslq %r12d,%rdi
sub $0x1,%rdi
jmp e8 <func0+0xe8>
# offset 0x71: reached when i == len-1; reload that value into ebx and go to
# the "match found" path at 0x11f.
mov 0xc(%rsp),%ebx
jmpq 11f <func0+0x11f>
# offset 0x7a: len <= 0 path. Call with (rdi = buffer, rsi = string,
# rdx = (long)len); presumably strncpy -- callee not visible. Then
# terminate at offset 0xc2.
movslq %r12d,%rdx
mov %rbp,%rsi
mov %rax,%rdi
callq 88 <func0+0x88>
# offset 0x88
jmp c2 <func0+0xc2>
# offset 0x8a: reached when i has been incremented up to len. Call with
# (buffer, string, (long)i); presumably strncpy -- callee not visible.
movslq %ebx,%rbx
mov %rbx,%rdx
mov %rbp,%rsi
mov %r14,%rdi
callq 9b <func0+0x9b>
# offset 0x9b: reverse-copy setup. rax = string + i - 1 (source, walks down),
# rdx = buffer + i (destination, walks up),
# rsi = string + i - 2 - (len-1 reloaded from the stack) = stop pointer.
lea -0x1(%rbp,%rbx,1),%rax
lea (%r14,%rbx,1),%rdx
lea -0x2(%rbp,%rbx,1),%rsi
mov 0xc(%rsp),%ecx
sub %rcx,%rsi
# offset 0xb0: copy one byte per iteration until the source hits the stop pointer.
movzbl (%rax),%ecx
mov %cl,(%rdx)
sub $0x1,%rax
add $0x1,%rdx
cmp %rsi,%rax
jne b0 <func0+0xb0>
# offset 0xc2: store the terminating 0 at buffer[2*len].
movslq %r15d,%r15
movb $0x0,(%r14,%r15,1)
# offset 0xca: common exit -- return the buffer pointer (or NULL) and restore
# the saved registers.
mov %r14,%rax
add $0x18,%rsp
pop %rbx
pop %rbp
pop %r12
pop %r13
pop %r14
pop %r15
retq
# offset 0xdc: inner comparison failed for this i: advance i and rsi; when
# i == len, leave the search via offset 0x8a.
add $0x1,%ebx
add $0x1,%rsi
cmp %ebx,%r13d
je 8a <func0+0x8a>
# offset 0xe8: ecx = (len - i) / 2 as a signed division (add the sign bit,
# then arithmetic shift right by one).
mov %r9d,%eax
sub %ebx,%eax
mov %eax,%ecx
shr $0x1f,%ecx
add %eax,%ecx
sar %ecx
# i == len-1 needs no comparison; handled at offset 0x71.
cmp %r8d,%ebx
je 71 <func0+0x71>
# rdx = address of the last byte (string + len - 1); j (rax) = 0.
lea 0x0(%rbp,%rdi,1),%rdx
mov $0x0,%eax
# offset 0x109: compare the byte at rdx (moving backwards from the end) with
# the byte at rsi + j (moving forwards from index i); a mismatch goes to
# 0xdc, otherwise continue while ecx > j (signed).
movzbl (%rdx),%r10d
cmp %r10b,(%rsi,%rax,1)
jne dc <func0+0xdc>
add $0x1,%rax
sub $0x1,%rdx
cmp %eax,%ecx
jg 109 <func0+0x109>
# offset 0x11f: match found for index i (ebx). Call with (buffer, string,
# r13 = (long)len); presumably strncpy -- callee not visible.
movslq %r12d,%r13
mov %r13,%rdx
mov %rbp,%rsi
mov %r14,%rdi
callq 130 <func0+0x130>
# offset 0x130: nothing to append when i <= 0.
test %ebx,%ebx
jle 15d <func0+0x15d>
# Reverse-copy setup: rax = string + i - 1 (source, walks down),
# rdx = buffer + len (destination, walks up),
# rsi = string + i - 2 - (i - 1) = stop pointer.
movslq %ebx,%rcx
lea -0x1(%rbp,%rcx,1),%rax
lea (%r14,%r13,1),%rdx
lea -0x2(%rbp,%rcx,1),%rsi
lea -0x1(%rbx),%ecx
sub %rcx,%rsi
# offset 0x14b: copy one byte per iteration until the source hits the stop pointer.
movzbl (%rax),%ecx
mov %cl,(%rdx)
sub $0x1,%rax
add $0x1,%rdx
cmp %rsi,%rax
jne 14b <func0+0x14b>
# offset 0x15d: store the terminating 0 at buffer[i + len], then take the
# common exit at 0xca.
lea (%rbx,%r12,1),%eax
cltq
movb $0x0,(%r14,%rax,1)
jmpq ca <func0+0xca>
This discrepancy raises concerns about the reliability and effectiveness of the language models trained on such data. Inaccurate representations could potentially undermine the model's ability to generalize and produce meaningful decompiled C functions.