ip: improve csum fold on x86_64
The new code seems to be about 1.5 clocks faster.
old:
mov eax,edi
shr rdi,0x20
add rdi,rax
movzx edx,di
shr rdi,0x10
add rdx,rdi
movzx eax,dx
shr rdx,0x10
add rax,rdx
mov rdx,rax
shr rdx,0x10
add eax,edx
new:
mov rax,rdi
shr rax,0x20
add eax,edi
mov edi,0x10
shrx edi,eax,edi
adc ax,di
adc ax,0x0
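
For reference, ip_csum_fold () reduces a 64-bit one's-complement
accumulator to 16 bits by repeatedly adding the high half into the low
half and folding the carry back in (end-around carry). The old sequence
materializes each carry with a mov/shr/add round trip, while the new
one lets ADC consume the carry flag directly. As a minimal sketch, the
portable fold (the C fallback kept under #else in the patch) looks like
this when written with plain stdint.h types instead of VPP's ip_csum_t:

#include <stdint.h>

static uint16_t
csum_fold_ref (uint64_t c)
{
  c = (c & 0xffffffffULL) + (c >> 32); /* 64 -> at most 33 bits */
  c = (c & 0xffff) + (c >> 16);        /* 33 -> at most 18 bits */
  c = (c & 0xffff) + (c >> 16);        /* 18 -> at most 17 bits */
  c = (c & 0xffff) + (c >> 16);        /* 17 -> 16 bits */
  return (uint16_t) c;
}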
Type: improvement
Change-Id: I3c565812c67ff4c3db197a9d4137a6c131b5b66c
Signed-off-by: Damjan Marion <damarion@cisco.com>
diff --git a/src/vnet/ip/ip_packet.h b/src/vnet/ip/ip_packet.h
index b0b5f41..d862caa 100644
--- a/src/vnet/ip/ip_packet.h
+++ b/src/vnet/ip/ip_packet.h
@@ -301,6 +301,20 @@
 ip_csum_fold (ip_csum_t c)
 {
   /* Reduce to 16 bits. */
+#ifdef __x86_64__
+  u64 tmp;
+  asm volatile(
+      /* using ADC is much faster than mov, shift, add sequence
+       * compiler produces */
+      "mov %k[sum], %k[tmp]           \n\t"
+      "shr $32, %[sum]                \n\t"
+      "add %k[tmp], %k[sum]           \n\t"
+      "mov $16, %k[tmp]               \n\t"
+      "shrx %k[tmp], %k[sum], %k[tmp] \n\t"
+      "adc %w[tmp], %w[sum]           \n\t"
+      "adc $0, %w[sum]                \n\t"
+      : [ sum ] "+&r"(c), [ tmp ] "=&r"(tmp));
+#else
 #if uword_bits == 64
   c = (c & (ip_csum_t) 0xffffffff) + (c >> (ip_csum_t) 32);
   c = (c & 0xffff) + (c >> 16);
@@ -308,7 +322,7 @@
   c = (c & 0xffff) + (c >> 16);
   c = (c & 0xffff) + (c >> 16);
-
+#endif
   return c;
 }
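
As a sanity check outside the VPP tree, the asm above can be dropped
into a small standalone harness and compared against the portable fold.
This is only an illustrative sketch, not part of the patch:
csum_fold_ref and csum_fold_adc are made-up names, and the adc variant
needs a BMI2-capable CPU (Haswell or later) at runtime for shrx:

#include <stdint.h>
#include <stdio.h>

static uint16_t
csum_fold_ref (uint64_t c)
{
  c = (c & 0xffffffffULL) + (c >> 32);
  c = (c & 0xffff) + (c >> 16);
  c = (c & 0xffff) + (c >> 16);
  c = (c & 0xffff) + (c >> 16);
  return (uint16_t) c;
}

static uint16_t
csum_fold_adc (uint64_t c)
{
  uint64_t tmp;
  asm volatile(
      "mov %k[sum], %k[tmp]           \n\t" /* tmp = low 32 bits */
      "shr $32, %[sum]                \n\t" /* sum = high 32 bits */
      "add %k[tmp], %k[sum]           \n\t" /* 32-bit add, sets CF */
      "mov $16, %k[tmp]               \n\t" /* mov preserves CF */
      "shrx %k[tmp], %k[sum], %k[tmp] \n\t" /* tmp = sum >> 16, CF kept */
      "adc %w[tmp], %w[sum]           \n\t" /* 16-bit add plus CF */
      "adc $0, %w[sum]                \n\t" /* fold the final carry */
      : [ sum ] "+&r"(c), [ tmp ] "=&r"(tmp));
  return (uint16_t) c;
}

int
main (void)
{
  uint64_t tests[] = { 0, 1, 0xffff, 0x10000, 0xfffffffeULL,
                       0x123456789abcdef0ULL, ~0ULL };
  for (unsigned i = 0; i < sizeof (tests) / sizeof (tests[0]); i++)
    printf ("%016llx ref=%04x adc=%04x\n",
            (unsigned long long) tests[i],
            csum_fold_ref (tests[i]), csum_fold_adc (tests[i]));
  return 0;
}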