I saw that u8::carrying_add has been stabilized, so I looked at the codegen when manually implementing a u128 add. It seems very bad. Am I using it wrong?
A godbolt link comparing some different attempts is here
None manage to replicate the assembly of u128::add, but the naive version is a lot closer than the u8::carrying_add version.
copy of code from godbolt link
#[unsafe(no_mangle)]
fn a(a: u8, b: u8, carry: bool) -> (u8,bool) {
u8::carrying_add(a,b,carry)
}
#[unsafe(no_mangle)]
fn b(a: u8, b: u8, carry: bool) -> (u8,bool) {
adc(a,b,carry)
}
#[unsafe(no_mangle)]
pub fn myadd(num1: u128, num2: u128) -> u128 {
num1 + num2
}
// add with carry
fn adc(num1: u8, num2: u8, carry: bool) -> (u8, bool) {
let carry = num1 as u16 + num2 as u16 + carry as u16 > u8::MAX as u16;
let sum = num1.wrapping_add(num2);
(sum, carry)
}
#[unsafe(no_mangle)]
pub fn myadd1(num: [u8;16], num2: [u8;16]) -> [u8;16] {
let mut carry = false;
let mut sum = [0u8;16];
for i in 0..num.len() {
let (sum_v, carry_v) = adc(num[i], num2[i], carry);
sum[i] = sum_v;
carry = carry_v;
}
sum
}
#[repr(C)]
struct AlignAs<A,T>(T,[A;0]);
#[unsafe(no_mangle)]
pub fn myadd2(num: AlignAs<u128, [u8;16]>, num2: AlignAs<u128, [u8;16]>) -> AlignAs<u128, [u8;16]> {
let mut carry = false;
let mut sum = AlignAs([0u8;16],[]);
for i in 0..num.0.len() {
let (sum_v, carry_v) = adc(num.0[i], num2.0[i], carry);
sum.0[i] = sum_v;
carry = carry_v;
}
sum
}
#[unsafe(no_mangle)]
pub fn myadd3(num: [u8;16], num2: [u8;16]) -> [u8;16] {
let mut carry = false;
let mut sum = [0u8;16];
for i in 0..num.len() {
let (sum_v, carry_v) = u8::carrying_add(num[i], num2[i], carry);
sum[i] = sum_v;
carry = carry_v;
}
sum
}
copy of asm
a:
mov eax, esi
add al, dil
setb cl
add al, dl
setb dl
or dl, cl
ret
b:
movzx ecx, dil
movzx eax, sil
add edx, eax
add edx, ecx
cmp edx, 256
setae dl
add al, cl
ret
myadd:
mov rax, rdi
add rax, rdx
adc rsi, rcx
mov rdx, rsi
ret
myadd1:
mov rax, rdi
movdqu xmm0, xmmword ptr [rsi]
movdqu xmm1, xmmword ptr [rdx]
paddb xmm1, xmm0
movdqu xmmword ptr [rdi], xmm1
ret
myadd2:
mov rax, rdi
movdqa xmm0, xmmword ptr [rdx]
paddb xmm0, xmmword ptr [rsi]
movdqa xmmword ptr [rdi], xmm0
ret
myadd3:
push rbp
push r14
push rbx
mov rax, rdi
movq xmm0, qword ptr [rsi]
movq xmm1, qword ptr [rdx]
paddb xmm1, xmm0
movdqa xmmword ptr [rsp - 16], xmm1
movzx ecx, byte ptr [rsp - 16]
mov byte ptr [rdi], cl
pmaxub xmm0, xmm1
pcmpeqb xmm0, xmm1
pmovmskb r10d, xmm0
not r10d
mov ecx, r10d
shr cl, 7
mov edi, r10d
and dil, 64
shr dil, 6
mov r8d, r10d
and r8b, 32
shr r8b, 5
mov r9d, r10d
and r9b, 16
shr r9b, 4
mov r11d, r10d
and r11b, 8
shr r11b, 3
mov ebx, r10d
and bl, 4
shr bl, 2
mov ebp, r10d
and bpl, 2
shr bpl
and r10b, 1
add r10b, byte ptr [rsp - 15]
setb r14b
or r14b, bpl
mov byte ptr [rax + 1], r10b
add r14b, byte ptr [rsp - 14]
setb r10b
or r10b, bl
mov byte ptr [rax + 2], r14b
add r10b, byte ptr [rsp - 13]
setb bl
or bl, r11b
mov byte ptr [rax + 3], r10b
add bl, byte ptr [rsp - 12]
setb r10b
or r10b, r9b
mov byte ptr [rax + 4], bl
add r10b, byte ptr [rsp - 11]
setb r9b
or r9b, r8b
mov byte ptr [rax + 5], r10b
add r9b, byte ptr [rsp - 10]
setb r8b
or r8b, dil
mov byte ptr [rax + 6], r9b
add r8b, byte ptr [rsp - 9]
setb dil
or dil, cl
mov byte ptr [rax + 7], r8b
movzx ecx, byte ptr [rdx + 8]
add dil, -1
adc cl, byte ptr [rsi + 8]
mov byte ptr [rax + 8], cl
movzx ecx, byte ptr [rdx + 9]
adc cl, byte ptr [rsi + 9]
movzx edi, byte ptr [rdx + 10]
adc dil, byte ptr [rsi + 10]
movzx r8d, byte ptr [rdx + 11]
adc r8b, byte ptr [rsi + 11]
mov byte ptr [rax + 9], cl
mov byte ptr [rax + 10], dil
mov byte ptr [rax + 11], r8b
movzx ecx, byte ptr [rdx + 12]
adc cl, byte ptr [rsi + 12]
mov byte ptr [rax + 12], cl
movzx ecx, byte ptr [rdx + 13]
adc cl, byte ptr [rsi + 13]
movzx edi, byte ptr [rdx + 14]
adc dil, byte ptr [rsi + 14]
movzx edx, byte ptr [rdx + 15]
adc dl, byte ptr [rsi + 15]
mov byte ptr [rax + 13], cl
mov byte ptr [rax + 14], dil
mov byte ptr [rax + 15], dl
pop rbx
pop r14
pop rbp
ret