c2goasm
c2goasm copied to clipboard
c2goasm takes forever to generate the Go assembly
Hi, I am testing c2goasm. It works for a simple function, but for a bigger one it seems to run forever. Is there a problem in my configuration, or is this a bug?

build command: clang -S -O3 -masm=intel -mno-red-zone -mstackrealign -mllvm -inline-threshold=1000 -fno-asynchronous-unwind-tables -fno-exceptions -fno-rtti $1
Generated CLang ASM
.text
.intel_syntax noprefix
.file "encoder.c"
# qoi_write_32: store the 32-bit value in edx into the byte buffer at rdi,
# most-significant byte first. The int at [rsi] is the write index: before
# each byte it is reloaded from memory, incremented in memory, and its old
# value is used as the store offset.
#   In:    rdi = byte buffer, rsi = pointer to int index, edx = value
#   Uses:  rax, rcx, r8 as scratch; rbp is saved and restored
.globl qoi_write_32 # -- Begin function qoi_write_32
.p2align 4, 0x90
.type qoi_write_32,@function
qoi_write_32: # @qoi_write_32
# %bb.0:
push rbp
mov rbp, rsp
and rsp, -8
# byte 0: bits 31..24 of the value
mov eax, edx
shr eax, 24
movsxd r8, dword ptr [rsi]
lea ecx, [r8 + 1]
mov dword ptr [rsi], ecx
mov byte ptr [rdi + r8], al
# byte 1: bits 23..16 of the value
mov eax, edx
shr eax, 16
movsxd r8, dword ptr [rsi]
lea ecx, [r8 + 1]
mov dword ptr [rsi], ecx
mov byte ptr [rdi + r8], al
# byte 2: bits 15..8, taken directly from dh
movsxd rax, dword ptr [rsi]
lea ecx, [rax + 1]
mov dword ptr [rsi], ecx
mov byte ptr [rdi + rax], dh
# byte 3: bits 7..0, taken directly from dl
movsxd rax, dword ptr [rsi]
lea ecx, [rax + 1]
mov dword ptr [rsi], ecx
mov byte ptr [rdi + rax], dl
mov rsp, rbp
pop rbp
ret
.Lfunc_end0:
.size qoi_write_32, .Lfunc_end0-qoi_write_32
# -- End function
# qoi_read_32: read four consecutive bytes from the buffer at rdi, starting at
# the int index stored at [rsi], and combine them most-significant byte first.
# The index is loaded once into rcx; index+1, +2, +3 and +4 are then written
# back to [rsi] one after another, so it ends up advanced by four.
#   In:    rdi = byte buffer, rsi = pointer to int index
#   Out:   eax = (b0 << 24) | (b1 << 16) | (b2 << 8) | b3
#   Uses:  rcx, rdx, r8, r9 as scratch; rbp is saved and restored
.globl qoi_read_32 # -- Begin function qoi_read_32
.p2align 4, 0x90
.type qoi_read_32,@function
qoi_read_32: # @qoi_read_32
# %bb.0:
push rbp
mov rbp, rsp
and rsp, -8
movsxd rcx, dword ptr [rsi]
lea rax, [rcx + 1]
mov dword ptr [rsi], eax
# r8d = b0 (zero-extended)
movzx r8d, byte ptr [rdi + rcx]
lea rax, [rcx + 2]
mov dword ptr [rsi], eax
# r9d = b1
movzx r9d, byte ptr [rdi + rcx + 1]
lea rax, [rcx + 3]
mov dword ptr [rsi], eax
# eax = b2
movzx eax, byte ptr [rdi + rcx + 2]
lea edx, [rcx + 4]
mov dword ptr [rsi], edx
# ecx = b3 (the index copy in rcx is no longer needed)
movzx ecx, byte ptr [rdi + rcx + 3]
# assemble the result in eax
shl r8d, 24
shl r9d, 16
or r9d, r8d
shl eax, 8
or eax, r9d
or eax, ecx
mov rsp, rbp
pop rbp
ret
.Lfunc_end1:
.size qoi_read_32, .Lfunc_end1-qoi_read_32
# -- End function
# pixel_cpy: copy rdx bytes from [rsi] to [rdi], lowest address first, and
# return 1 in eax.
#   In:    rdi = destination, rsi = source, rdx = byte count
#   Out:   eax = 1
#   Uses:  rax, rcx, r8, r9, xmm0, xmm1 as scratch; rbp is saved and restored
# Shape of the generated code:
#   - count == 0                      -> return immediately
#   - count < 32, or the two ranges
#     overlap                         -> byte-wise copy (.LBB2_13)
#   - otherwise                       -> 16-byte vector copy (.LBB2_4), then a
#                                        byte-wise tail for count % 32
.globl pixel_cpy # -- Begin function pixel_cpy
.p2align 4, 0x90
.type pixel_cpy,@function
pixel_cpy: # @pixel_cpy
# %bb.0:
push rbp
mov rbp, rsp
and rsp, -8
test rdx, rdx
je .LBB2_20
# %bb.1:
cmp rdx, 32
jb .LBB2_13
# %bb.2:
# overlap check: the vector path is taken only if src+count <= dst ...
lea rax, [rsi + rdx]
cmp rax, rdi
jbe .LBB2_4
# %bb.3:
# ... or dst+count <= src; otherwise fall into the byte-wise copy
lea rax, [rdi + rdx]
cmp rax, rsi
jbe .LBB2_4
.LBB2_13:
# byte-wise copy: r8 = count - 1, r9 = count % 8 (bytes copied singly first)
lea r8, [rdx - 1]
mov r9, rdx
and r9, 7
je .LBB2_17
.LBB2_14:
xor ecx, ecx
.p2align 4, 0x90
.LBB2_15: # =>This Inner Loop Header: Depth=1
# copy one byte per iteration until rcx == r9
movzx eax, byte ptr [rsi + rcx]
mov byte ptr [rdi + rcx], al
add rcx, 1
cmp r9, rcx
jne .LBB2_15
# %bb.16:
# account for the bytes just copied: shrink the count, advance both pointers
sub rdx, rcx
add rsi, rcx
add rdi, rcx
.LBB2_17:
# r8 still holds (count - 1) from before the single-byte loop; below 7 means
# the count was under 8 and everything has already been copied
cmp r8, 7
jb .LBB2_20
# %bb.18:
xor eax, eax
.p2align 4, 0x90
.LBB2_19: # =>This Inner Loop Header: Depth=1
# eight bytes per iteration; rdx is a multiple of 8 at this point
movzx ecx, byte ptr [rsi + rax]
mov byte ptr [rdi + rax], cl
movzx ecx, byte ptr [rsi + rax + 1]
mov byte ptr [rdi + rax + 1], cl
movzx ecx, byte ptr [rsi + rax + 2]
mov byte ptr [rdi + rax + 2], cl
movzx ecx, byte ptr [rsi + rax + 3]
mov byte ptr [rdi + rax + 3], cl
movzx ecx, byte ptr [rsi + rax + 4]
mov byte ptr [rdi + rax + 4], cl
movzx ecx, byte ptr [rsi + rax + 5]
mov byte ptr [rdi + rax + 5], cl
movzx ecx, byte ptr [rsi + rax + 6]
mov byte ptr [rdi + rax + 6], cl
movzx ecx, byte ptr [rsi + rax + 7]
mov byte ptr [rdi + rax + 7], cl
add rax, 8
cmp rdx, rax
jne .LBB2_19
jmp .LBB2_20
.LBB2_4:
# vector copy: r8 = count rounded down to a multiple of 32,
# rcx = number of 32-byte chunks, r9 = chunks % 4
mov r8, rdx
and r8, -32
lea rax, [r8 - 32]
mov rcx, rax
shr rcx, 5
add rcx, 1
mov r9d, ecx
and r9d, 3
# fewer than four chunks: skip the 128-byte loop
cmp rax, 96
jae .LBB2_6
# %bb.5:
xor eax, eax
jmp .LBB2_8
.LBB2_6:
# rcx = number of chunks handled by the 128-byte loop (a multiple of 4)
sub rcx, r9
xor eax, eax
.p2align 4, 0x90
.LBB2_7: # =>This Inner Loop Header: Depth=1
# four 32-byte chunks (128 bytes) per iteration, unaligned loads and stores
movups xmm0, xmmword ptr [rsi + rax]
movups xmm1, xmmword ptr [rsi + rax + 16]
movups xmmword ptr [rdi + rax], xmm0
movups xmmword ptr [rdi + rax + 16], xmm1
movups xmm0, xmmword ptr [rsi + rax + 32]
movups xmm1, xmmword ptr [rsi + rax + 48]
movups xmmword ptr [rdi + rax + 32], xmm0
movups xmmword ptr [rdi + rax + 48], xmm1
movups xmm0, xmmword ptr [rsi + rax + 64]
movups xmm1, xmmword ptr [rsi + rax + 80]
movups xmmword ptr [rdi + rax + 64], xmm0
movups xmmword ptr [rdi + rax + 80], xmm1
movups xmm0, xmmword ptr [rsi + rax + 96]
movups xmm1, xmmword ptr [rsi + rax + 112]
movups xmmword ptr [rdi + rax + 96], xmm0
movups xmmword ptr [rdi + rax + 112], xmm1
# rax += 128 (written as a subtraction of -128)
sub rax, -128
add rcx, -4
jne .LBB2_7
.LBB2_8:
# leftover 32-byte chunks (r9 of them, 0..3)
test r9, r9
je .LBB2_11
# %bb.9:
# bias the offset by 16 and count r9 upwards from -r9 to zero
add rax, 16
neg r9
.p2align 4, 0x90
.LBB2_10: # =>This Inner Loop Header: Depth=1
movups xmm0, xmmword ptr [rsi + rax - 16]
movups xmm1, xmmword ptr [rsi + rax]
movups xmmword ptr [rdi + rax - 16], xmm0
movups xmmword ptr [rdi + rax], xmm1
add rax, 32
inc r9
jne .LBB2_10
.LBB2_11:
# done if the count was an exact multiple of 32, otherwise copy the tail
cmp r8, rdx
jne .LBB2_12
.LBB2_20:
# common exit: return 1
mov eax, 1
mov rsp, rbp
pop rbp
ret
.LBB2_12:
# tail after the vector copy: count = count % 32, pointers advanced past the
# vector-copied bytes, then re-enter the byte-wise copy
and edx, 31
add rsi, r8
add rdi, r8
lea r8, [rdx - 1]
mov r9, rdx
and r9, 7
jne .LBB2_14
jmp .LBB2_17
.Lfunc_end2:
.size pixel_cpy, .Lfunc_end2-pixel_cpy
# -- End function
# qoi_pixel_encoder: emit the encoding of one pixel into a byte buffer and
# return 1 in eax.
#   In (register args):  rdi, rsi, rdx, ecx, r8d, r9d
#   In (stack args):     [rbp+16] (dword), [rbp+24], [rbp+32], [rbp+40] (qwords)
#   NOTE(review): matched against the C prototype quoted with this listing,
#   these look like data, cur, run, x, y, maxX, maxY, px_prev, px, index --
#   confirm the listing was built from exactly that source.
#   Out:   eax = 1
#   Saves and restores rbx, r12-r15 and rbp.
# Every output byte is stored at [rdi + index], where the index is an int
# loaded through rsi. This function never stores to [rsi]; it only moves the
# rsi pointer itself forward by 4 bytes per emitted byte.
.globl qoi_pixel_encoder # -- Begin function qoi_pixel_encoder
.p2align 4, 0x90
.type qoi_pixel_encoder,@function
qoi_pixel_encoder: # @qoi_pixel_encoder
# %bb.0:
push rbp
mov rbp, rsp
push r15
push r14
push r13
push r12
push rbx
and rsp, -8
# load the stack-passed arguments
mov ebx, dword ptr [rbp + 16]
mov r10, qword ptr [rbp + 24]
mov r11, qword ptr [rbp + 32]
# rax = 1 if the two pixel pointers (r11, r10) are equal, else 0
xor eax, eax
cmp r11, r10
sete al
# cl = 1 when ecx == r9d - 1 and r8d == (stack dword) - 1
add r9d, -1
xor r9d, ecx
add ebx, -1
xor ebx, r8d
or ebx, r9d
sete cl
# cache the four bytes at [r11]
mov r9b, byte ptr [r11]
mov r13b, byte ptr [r11 + 1]
mov r14b, byte ptr [r11 + 2]
mov r8b, byte ptr [r11 + 3]
# ebx = the int at [rdx] when the pointers differ, or the int at [rdx + 4]
# when they are equal (rax is used as an element index here)
mov ebx, dword ptr [rdx + 4*rax]
# flush the run when the pointers differ, or cl is set, or ebx == 8224 (0x2020)
cmp r11, r10
jne .LBB3_3
# %bb.1:
test cl, cl
jne .LBB3_3
# %bb.2:
cmp ebx, 8224
jne .LBB3_7
.LBB3_3:
# rax becomes the byte offset (0 or 4) of the selected int relative to rdx
shl rax, 2
# signed compare: values up to 32 use the one-byte form
cmp ebx, 32
jg .LBB3_5
# %bb.4:
# one-byte form: (value - 1) | 64
add bl, -1
or bl, 64
mov r12d, 1
mov r15, rsi
jmp .LBB3_6
.LBB3_5:
# two-byte form: value -= 33 (stored back), first byte = (value >> 8) | 96,
# second byte = low byte of the stored value
add ebx, -33
mov dword ptr [rdx + rax], ebx
shr ebx, 8
or bl, 96
lea r15, [rsi + 4]
movsxd rcx, dword ptr [rsi]
mov byte ptr [rdi + rcx], bl
mov bl, byte ptr [rdx + rax]
mov r12d, 2
.LBB3_6:
# advance rsi past the 1 or 2 index slots used, store the last byte through
# the slot at r15, then zero the selected int
lea rsi, [rsi + 4*r12]
movsxd rcx, dword ptr [r15]
mov byte ptr [rdi + rcx], bl
mov dword ptr [rdx + rax], 0
.LBB3_7:
# nothing more to emit when the two pixel pointers are equal
cmp r11, r10
je .LBB3_25
# %bb.8:
# r13b = XOR of the four cached bytes; it selects the qword at
# [table + r13b * 32], where table is the pointer loaded from [rbp + 40]
mov rax, qword ptr [rbp + 40]
xor r13b, r9b
xor r13b, r14b
xor r13b, r8b
movzx ecx, r13b
shl rcx, 5
mov rax, qword ptr [rax + rcx]
# if that table entry is the same pointer as r11, emit just the hash byte
cmp rax, r11
je .LBB3_26
# %bb.9:
# copy the four bytes at [r11] to the address held in the table entry
mov cl, byte ptr [r11]
mov byte ptr [rax], cl
mov cl, byte ptr [r11 + 1]
mov byte ptr [rax + 1], cl
mov cl, byte ptr [r11 + 2]
mov byte ptr [rax + 2], cl
mov cl, byte ptr [r11 + 3]
mov byte ptr [rax + 3], cl
# signed per-byte differences [r11 + k] - [r10 + k]:
#   ecx = d0, r9d = d1, edx = d2, r8d = d3
movsx ecx, byte ptr [r11]
movsx eax, byte ptr [r10]
sub ecx, eax
movsx r9d, byte ptr [r11 + 1]
movsx eax, byte ptr [r10 + 1]
sub r9d, eax
movsx edx, byte ptr [r11 + 2]
movsx eax, byte ptr [r10 + 2]
sub edx, eax
movsx r8d, byte ptr [r11 + 3]
movsx eax, byte ptr [r10 + 3]
sub r8d, eax
# range check: OR together all four (d + 16); an unsigned result >= 32 means
# at least one difference lies outside -16..15
# keeps r15d = d0 + 16, r14d = d2 + 16, r10d = d3 + 16 for later
lea r15d, [rcx + 16]
lea eax, [r9 + 16]
or eax, r15d
lea r14d, [rdx + 16]
lea r10d, [r8 + 16]
mov ebx, r14d
or ebx, r10d
or ebx, eax
cmp ebx, 32
jae .LBB3_16
# %bb.10:
# r11 is reused as scratch from here on: r11d = d2 + 2
lea r11d, [rdx + 2]
cmp r11d, 3
ja .LBB3_13
# %bb.11:
# one-byte difference form needs d0 and d1 in -2..1 and d3 == 0
lea eax, [rcx + 2]
lea ebx, [r9 + 2]
or ebx, eax
and ebx, -4
or ebx, r8d
jne .LBB3_13
# %bb.12:
# byte = 0x80 | ((d0 + 2) << 4) | ((d1 + 2) << 2) | (d2 + 2)
shl ecx, 4
add ecx, 32
lea eax, [4*r9 + 8]
or eax, ecx
or eax, r11d
or al, -128
movsxd rcx, dword ptr [rsi]
mov byte ptr [rdi + rcx], al
jmp .LBB3_25
.LBB3_26:
# table hit: the output byte is the XOR hash itself
movsxd rax, dword ptr [rsi]
mov byte ptr [rdi + rax], r13b
jmp .LBB3_25
.LBB3_16:
# full form: header byte = 0xF0 | (d0 != 0) << 3 | (d1 != 0) << 2
#                               | (d2 != 0) << 1 | (d3 != 0)
test ecx, ecx
setne al
shl al, 3
test r9d, r9d
setne bl
shl bl, 2
or bl, al
test edx, edx
setne al
add al, al
or al, bl
test r8d, r8d
setne bl
or bl, al
or bl, -16
movsxd rax, dword ptr [rsi]
mov byte ptr [rdi + rax], bl
# then one raw byte from [r11 + k] for every non-zero difference
test ecx, ecx
je .LBB3_17
# %bb.18:
# byte 0 is written through the next index slot; rsi skips header + this slot
mov al, byte ptr [r11]
movsxd rcx, dword ptr [rsi + 4]
add rsi, 8
mov byte ptr [rdi + rcx], al
test r9d, r9d
je .LBB3_21
.LBB3_20:
mov al, byte ptr [r11 + 1]
movsxd rcx, dword ptr [rsi]
add rsi, 4
mov byte ptr [rdi + rcx], al
.LBB3_21:
test edx, edx
je .LBB3_23
# %bb.22:
mov al, byte ptr [r11 + 2]
movsxd rcx, dword ptr [rsi]
add rsi, 4
mov byte ptr [rdi + rcx], al
.LBB3_23:
test r8d, r8d
je .LBB3_25
# %bb.24:
mov al, byte ptr [r11 + 3]
movsxd rcx, dword ptr [rsi]
mov byte ptr [rdi + rcx], al
jmp .LBB3_25
.LBB3_13:
# two-byte form needs d1 and d2 in -8..7 and d3 == 0
# note: edx is changed to d2 + 8 here and used in that form below
lea eax, [r9 + 8]
add edx, 8
or eax, edx
and eax, -16
or eax, r8d
je .LBB3_14
# %bb.15:
# three-byte form, each value truncated to its low 8 bits when stored:
#   byte 0 = 0xE0 | ((d0 + 16) >> 1)
#   byte 1 = ((d0 + 16) << 7) | ((d1 + 16) << 2) | ((d2 + 16) >> 3)
#   byte 2 = ((d2 + 16) << 7) | (d3 + 16)
mov eax, r15d
shr al
or al, -32
movsxd rcx, dword ptr [rsi]
mov byte ptr [rdi + rcx], al
shl r15d, 7
lea eax, [4*r9 + 64]
or eax, r15d
mov ecx, r14d
shr ecx, 3
or ecx, eax
movsxd rax, dword ptr [rsi + 4]
mov byte ptr [rdi + rax], cl
shl r14d, 7
or r10d, r14d
movsxd rax, dword ptr [rsi + 8]
mov byte ptr [rdi + rax], r10b
jmp .LBB3_25
.LBB3_17:
# d0 == 0: only the header slot is consumed before continuing with d1
add rsi, 4
test r9d, r9d
jne .LBB3_20
jmp .LBB3_21
.LBB3_14:
# two-byte form:
#   byte 0 = 0xC0 | (d0 + 16)
#   byte 1 = ((d1 + 8) << 4) | (d2 + 8)
or r15b, -64
movsxd rax, dword ptr [rsi]
mov byte ptr [rdi + rax], r15b
shl r9d, 4
sub r9d, -128
or edx, r9d
movsxd rax, dword ptr [rsi + 4]
mov byte ptr [rdi + rax], dl
.LBB3_25:
# common exit: return 1, drop the realigned stack, restore callee-saved regs
mov eax, 1
lea rsp, [rbp - 40]
pop rbx
pop r12
pop r13
pop r14
pop r15
pop rbp
ret
.Lfunc_end3:
.size qoi_pixel_encoder, .Lfunc_end3-qoi_pixel_encoder
# -- End function
.ident "clang version 10.0.0-4ubuntu1 "
.section ".note.GNU-stack","",@progbits
.addrsig
C code
#ifndef QOI_ENCODER_
#define QOI_ENCODER_
#include <stddef.h>
#include "qoi.h"
/* Copy sz bytes from src to dst one byte at a time, lowest address first. */
void pixel_cpy(char *dst, char *src, size_t sz)
{
    for (size_t i = 0; i < sz; ++i)
    {
        dst[i] = src[i];
    }
}
/*
 * Encode one pixel into the output stream.
 *
 *   data     output byte buffer
 *   cur      in/out: write position inside data; advanced by one for every
 *            byte emitted
 *   run      in/out: length of the current run of repeated pixels
 *   x, y     position of this pixel
 *   maxX,
 *   maxY     image dimensions; (maxX - 1, maxY - 1) is the last pixel
 *   px_prev  previous pixel (4 bytes)
 *   px       current pixel (4 bytes)
 *   index    table of recently seen pixels
 *
 * Always returns 1.
 *
 * Fixes relative to the earlier version of this function:
 *   - `*run++` / `*cur++` parse as `*(run++)` / `*(cur++)`: they moved the
 *     local pointer instead of incrementing the counter, so the caller's
 *     counters never changed and later accesses went through cur[1], cur[2],
 *     ... Now written as `(*run)++` / `(*cur)++`.
 *   - `*run > 0 && A || B || C` groups as `(*run > 0 && A) || B || C`, which
 *     emitted a run byte even when the run was empty. The `*run > 0` guard now
 *     covers the whole condition.
 *   - The third QOI_DIFF_24 byte shifted (vb + 16) by 7, dropping two of the
 *     three low bits that the second byte does not carry; the shift is 5 so
 *     those bits sit above the 5-bit (va + 16) field.
 *   - The trailing `px_prev = px;` only changed the by-value parameter and had
 *     no effect; the caller has to remember the previous pixel itself.
 */
int qoi_pixel_encoder(
    char *data, int *cur, int *run,
    const int x, const int y,
    const int maxX, const int maxY,
    const char *px_prev, char *px,
    char **index) // [64][4]
{
    qoi_rgba_t px_ = {.rgba = {
                          .r = px[0],
                          .g = px[1],
                          .b = px[2],
                          .a = px[3],
                      }};

    /* NOTE(review): px == px_prev compares the pointers, not the four pixel
     * bytes -- confirm that pointer identity is what callers rely on. */
    if (px == px_prev)
    {
        (*run)++;
    }

    int last_pixel = x == maxX - 1 && y == (maxY - 1);
    if (*run > 0 && (*run == 0x2020 || px != px_prev || last_pixel))
    {
        if (*run < 33)
        {
            data[(*cur)++] = QOI_RUN_8 | (*run - 1);
        }
        else
        {
            *run -= 33;
            data[(*cur)++] = QOI_RUN_16 | (*run >> 8);
            data[(*cur)++] = *run & 0xFF;
        }
        *run = 0;
    }

    if (px != px_prev)
    {
        int index_pos = QOI_COLOR_HASH(px_);

        /* NOTE(review): `index` is a char **, so index[index_pos * 4] reads a
         * pointer and compares it with px by address. The "[64][4]" remark on
         * the parameter suggests a flat 64 x 4 byte table was intended, and
         * index_pos is not reduced to 0..63 here -- verify against the
         * caller and the QOI_COLOR_HASH definition. */
        if (index[index_pos * 4] == px)
        {
            data[(*cur)++] = QOI_INDEX | index_pos;
        }
        else
        {
            pixel_cpy(index[index_pos * 4], px, 4);

            int vr = px[0] - px_prev[0];
            int vg = px[1] - px_prev[1];
            int vb = px[2] - px_prev[2];
            int va = px[3] - px_prev[3];

            if (
                vr > -17 && vr < 16 &&
                vg > -17 && vg < 16 &&
                vb > -17 && vb < 16 &&
                va > -17 && va < 16)
            {
                if (
                    va == 0 &&
                    vr > -3 && vr < 2 &&
                    vg > -3 && vg < 2 &&
                    vb > -3 && vb < 2)
                {
                    /* 2 bits per channel, alpha unchanged */
                    data[(*cur)++] = QOI_DIFF_8 | (vr + 2) << 4 | (vg + 2) << 2 | (vb + 2);
                }
                else if (
                    va == 0 &&
                    vr > -17 && vr < 16 &&
                    vg > -9 && vg < 8 &&
                    vb > -9 && vb < 8)
                {
                    /* 5 bits red, 4 bits green, 4 bits blue, alpha unchanged */
                    data[(*cur)++] = QOI_DIFF_16 | (vr + 16);
                    data[(*cur)++] = (vg + 8) << 4 | (vb + 8);
                }
                else
                {
                    /* 5 bits per channel, packed across three bytes; each
                     * store keeps only the low 8 bits of the expression */
                    data[(*cur)++] = QOI_DIFF_24 | (vr + 16) >> 1;
                    data[(*cur)++] = (vr + 16) << 7 | (vg + 16) << 2 | (vb + 16) >> 3;
                    data[(*cur)++] = (vb + 16) << 5 | (va + 16);
                }
            }
            else
            {
                /* header flags which channels follow as raw bytes */
                data[(*cur)++] = QOI_COLOR | (vr ? 8 : 0) | (vg ? 4 : 0) | (vb ? 2 : 0) | (va ? 1 : 0);
                if (vr)
                {
                    data[(*cur)++] = px[0];
                }
                if (vg)
                {
                    data[(*cur)++] = px[1];
                }
                if (vb)
                {
                    data[(*cur)++] = px[2];
                }
                if (va)
                {
                    data[(*cur)++] = px[3];
                }
            }
        }
    }
    return 1;
}
#endif
I can reproduce it by using this code:
/*
 * Fill t[0 .. sz-1] with the low 8 bits of each element's own index and
 * return 1.
 *
 * The counter is a size_t so it has the same type as the bound: with an int
 * counter the comparison mixed signed and unsigned operands and the increment
 * overflowed (undefined behavior) once sz exceeded INT_MAX.
 */
int TesByteSlice(char *t, size_t sz)
{
    for (size_t i = 0; i < sz; i++)
    {
        t[i] = (char)i;
    }
    return 1;
}
or
/*
 * goto-based variant: fill t[0 .. sz-1] with the low 8 bits of each element's
 * own index and return 1.
 *
 * The loop body runs before the bound is tested, so an empty buffer is
 * rejected up front; without the guard one byte was written even for sz == 0.
 * The counter is a size_t to match the type of sz.
 */
int TesByteSlice(char *t, size_t sz)
{
    size_t i = 0;
    if (sz == 0)
        return 1;
loop:
    *t++ = (char)i++;
    if (i < sz)
        goto loop;
    return 1;
}
Generated CLang ASM
.text
.intel_syntax noprefix
.file "tes.c"
# TesSum: return the 32-bit sum of the first two integer arguments.
#   In:    edi, esi
#   Out:   eax = edi + esi (computed with a single lea, so flags are not
#          changed by the addition itself)
.globl TesSum # -- Begin function TesSum
.p2align 4, 0x90
.type TesSum, @function
TesSum:
# @TesSum
# %bb.0:
push rbp
mov rbp, rsp
and rsp, -8
# kill: def $esi killed $esi def $rsi
# kill: def $edi killed $edi def $rdi
lea eax, [rdi + rsi]
mov rsp, rbp
pop rbp
ret
.Lfunc_end0:
.size TesSum, .Lfunc_end0-TesSum
# -- End function
# Constant pool for TesByteSlice: 16-byte vectors, 16-byte aligned.
#   .LCPI1_0           bytes 0, 1, ..., 15 (the pattern for the first 16 bytes)
#   .LCPI1_1 - 1_8     16 copies each of 16, 32, 48, 64, 80, 96, 112, 128
.section .rodata.cst16, "aM", @progbits, 16
.p2align 4 # -- Begin function TesByteSlice
.LCPI1_0:
.byte 0 # 0x0
.byte 1 # 0x1
.byte 2 # 0x2
.byte 3 # 0x3
.byte 4 # 0x4
.byte 5 # 0x5
.byte 6 # 0x6
.byte 7 # 0x7
.byte 8 # 0x8
.byte 9 # 0x9
.byte 10 # 0xa
.byte 11 # 0xb
.byte 12 # 0xc
.byte 13 # 0xd
.byte 14 # 0xe
.byte 15 # 0xf
.LCPI1_1:
.zero 16, 16
.LCPI1_2:
.zero 16, 32
.LCPI1_3:
.zero 16, 48
.LCPI1_4:
.zero 16, 64
.LCPI1_5:
.zero 16, 80
.LCPI1_6:
.zero 16, 96
.LCPI1_7:
.zero 16, 112
.LCPI1_8:
.zero 16, 128
.text
# TesByteSlice: store the low byte of each position's index into the buffer,
# i.e. byte k of the buffer becomes (k mod 256), and return 1.
#   In:    rdi = buffer, rsi = byte count
#   Out:   eax = 1
#   Uses:  rax, rcx, rdx, r8, xmm0-xmm9 as scratch; rbp is saved and restored
# Shape of the generated code:
#   - count == 0    -> return immediately
#   - count <= 15   -> scalar loop only (.LBB1_12)
#   - otherwise     -> 128-byte vector loop (.LBB1_6), 16-byte vector loop
#                      (.LBB1_9), then the scalar loop for count % 16
# The constant pool is addressed RIP-relatively and loaded with movdqa, which
# relies on its 16-byte alignment.
.globl TesByteSlice
.p2align 4, 0x90
.type TesByteSlice, @function
TesByteSlice:
# @TesByteSlice
# %bb.0:
push rbp
mov rbp, rsp
and rsp, -8
test rsi, rsi
je .LBB1_13
# %bb.1:
cmp rsi, 15
ja .LBB1_3
# %bb.2:
# short buffer: scalar loop starting at index 0
xor eax, eax
jmp .LBB1_12
.LBB1_3:
# rax = count rounded down to a multiple of 16,
# rcx = number of 16-byte blocks, r8 = blocks % 8
mov rax, rsi
and rax, -16
lea rdx, [rax - 16]
mov rcx, rdx
shr rcx, 4
add rcx, 1
mov r8d, ecx
and r8d, 7
# fewer than eight blocks: skip the 128-byte loop
cmp rdx, 112
jae .LBB1_5
# %bb.4:
movdqa xmm2, xmmword ptr [rip + .LCPI1_0] # xmm2 = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
xor edx, edx
jmp .LBB1_7
.LBB1_5:
# rcx = number of blocks handled by the 128-byte loop (a multiple of 8);
# xmm2 holds the pattern for the current block, the other registers hold the
# per-block offsets
sub rcx, r8
movdqa xmm2, xmmword ptr [rip + .LCPI1_0] # xmm2 = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
xor edx, edx
movdqa xmm8, xmmword ptr [rip + .LCPI1_1] # xmm8 = [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16]
movdqa xmm9, xmmword ptr [rip + .LCPI1_2] # xmm9 = [32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32]
movdqa xmm3, xmmword ptr [rip + .LCPI1_3] # xmm3 = [48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48]
movdqa xmm4, xmmword ptr [rip + .LCPI1_4] # xmm4 = [64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64]
movdqa xmm5, xmmword ptr [rip + .LCPI1_5] # xmm5 = [80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80]
movdqa xmm6, xmmword ptr [rip + .LCPI1_6] # xmm6 = [96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96]
movdqa xmm7, xmmword ptr [rip + .LCPI1_7] # xmm7 = [112, 112, 112, 112, 112, 112, 112, 112, 112, 112, 112, 112, 112, 112, 112, 112]
movdqa xmm1, xmmword ptr [rip + .LCPI1_8] # xmm1 = [128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128]
.p2align 4, 0x90
.LBB1_6: # =>This Inner Loop Header: Depth=1
# eight blocks (128 bytes) per iteration: block j stores xmm2 + 16*j per byte
movdqu xmmword ptr [rdi + rdx], xmm2
movdqa xmm0, xmm2
paddb xmm0, xmm8
movdqu xmmword ptr [rdi + rdx + 16], xmm0
movdqa xmm0, xmm2
paddb xmm0, xmm9
movdqu xmmword ptr [rdi + rdx + 32], xmm0
movdqa xmm0, xmm2
paddb xmm0, xmm3
movdqu xmmword ptr [rdi + rdx + 48], xmm0
movdqa xmm0, xmm2
paddb xmm0, xmm4
movdqu xmmword ptr [rdi + rdx + 64], xmm0
movdqa xmm0, xmm2
paddb xmm0, xmm5
movdqu xmmword ptr [rdi + rdx + 80], xmm0
movdqa xmm0, xmm2
paddb xmm0, xmm6
movdqu xmmword ptr [rdi + rdx + 96], xmm0
movdqa xmm0, xmm2
paddb xmm0, xmm7
movdqu xmmword ptr [rdi + rdx + 112], xmm0
# rdx += 128 (written as a subtraction of -128)
sub rdx, -128
# flip bit 7 of every byte lane, which equals adding 128 modulo 256
pxor xmm2, xmm1
add rcx, -8
jne .LBB1_6
.LBB1_7:
# leftover 16-byte blocks (r8 of them, 0..7)
test r8, r8
je .LBB1_10
# %bb.8:
# rdx becomes an absolute store address; r8 counts upwards from -r8 to zero
add rdx, rdi
neg r8
movdqa xmm1, xmmword ptr [rip + .LCPI1_1] # xmm1 = [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16]
.p2align 4, 0x90
.LBB1_9: # =>This Inner Loop Header: Depth=1
# one block per iteration, then advance the pattern by 16 per byte
movdqu xmmword ptr [rdx], xmm2
paddb xmm2, xmm1
add rdx, 16
inc r8
jne .LBB1_9
.LBB1_10:
# done if the count was an exact multiple of 16
cmp rax, rsi
je .LBB1_13
# %bb.11:
# scalar tail starts right after the vector-written bytes; rax is the index
add rdi, rax
.p2align 4, 0x90
.LBB1_12: # =>This Inner Loop Header: Depth=1
# store the low byte of the running index, one byte per iteration
mov byte ptr [rdi], al
add rdi, 1
add rax, 1
cmp rsi, rax
jne .LBB1_12
.LBB1_13:
# common exit: return 1
mov eax, 1
mov rsp, rbp
pop rbp
ret
.Lfunc_end1:
.size TesByteSlice, .Lfunc_end1-TesByteSlice
# -- End function
.ident "clang version 10.0.0-4ubuntu1 "
.section ".note.GNU-stack", "", @progbits
.addrsig
A recursive function works, but let's ignore that, I guess.
Edit: I tried it with byte slices and it doesn't work :/
/*
 * Recursive variant: store the low 8 bits of the remaining count into the
 * first byte, then handle the rest of the buffer with a count one smaller.
 * Returns 1 once the count reaches zero.
 */
int TesByteSlice(char *t, size_t sz)
{
    if (sz != 0)
    {
        t[0] = sz;
        return TesByteSlice(t + 1, sz - 1);
    }
    return 1;
}