Simplify adjacency rewrite code

Using memcpy instead of complex specific copy logic. This simplify
the implementation and also improve perf slightly.
Also move adjacency data from tail to head of buffer, which improves
cache locality (header and data share the same cacheline)
Finally, fix VxLAN which used to workaround vnet_rewrite logic.

Change-Id: I770ddad9846f7ee505aa99ad417e6a61d5cbbefa
Signed-off-by: Benoît Ganne <bganne@cisco.com>
diff --git a/src/vnet/adj/rewrite.c b/src/vnet/adj/rewrite.c
index 975dc4a..95671b0 100644
--- a/src/vnet/adj/rewrite.c
+++ b/src/vnet/adj/rewrite.c
@@ -40,30 +40,6 @@
 #include <vnet/vnet.h>
 #include <vnet/ip/lookup.h>
 
-void
-vnet_rewrite_copy_slow_path (vnet_rewrite_data_t * p0,
-			     vnet_rewrite_data_t * rw0,
-			     word n_left, uword most_likely_size)
-{
-  uword n_done =
-    round_pow2 (most_likely_size, sizeof (rw0[0])) / sizeof (rw0[0]);
-
-  p0 -= n_done;
-  rw0 -= n_done;
-
-  /* As we enter the cleanup loop, p0 and rw0 point to the last chunk written
-     by the fast path. Hence, the constant 1, which the
-     vnet_rewrite_copy_one macro renders as p0[-1] = rw0[-1]. */
-
-  while (n_left > 0)
-    {
-      vnet_rewrite_copy_one (p0, rw0, 1);
-      p0--;
-      rw0--;
-      n_left--;
-    }
-}
-
 u8 *
 format_vnet_rewrite (u8 * s, va_list * args)
 {
@@ -72,6 +48,8 @@
   CLIB_UNUSED (u32 indent) = va_arg (*args, u32);
   vnet_main_t *vnm = vnet_get_main ();
 
+  ASSERT (rw->data_bytes <= max_data_bytes);
+
   if (rw->sw_if_index != ~0)
     {
       vnet_sw_interface_t *si;
@@ -86,9 +64,7 @@
 
   /* Format rewrite string. */
   if (rw->data_bytes > 0)
-    s = format (s, " %U",
-		format_hex_bytes,
-		rw->data + max_data_bytes - rw->data_bytes, rw->data_bytes);
+    s = format (s, " %U", format_hex_bytes, rw->data, rw->data_bytes);
 
   return s;
 }
diff --git a/src/vnet/adj/rewrite.h b/src/vnet/adj/rewrite.h
index 3278113..05a722f 100644
--- a/src/vnet/adj/rewrite.h
+++ b/src/vnet/adj/rewrite.h
@@ -131,8 +131,8 @@
   ASSERT ((data_bytes >= 0) && (data_bytes < max_size));
 
   rw->data_bytes = data_bytes;
-  clib_memcpy_fast (rw->data + max_size - data_bytes, data, data_bytes);
-  clib_memset (rw->data, 0xfe, max_size - data_bytes);
+  clib_memcpy_fast (rw->data, data, data_bytes);
+  clib_memset (rw->data + data_bytes, 0xfe, max_size - data_bytes);
 }
 
 #define vnet_rewrite_set_data(rw,data,data_bytes)		\
@@ -145,76 +145,28 @@
 vnet_rewrite_get_data_internal (vnet_rewrite_header_t * rw, int max_size)
 {
   ASSERT (rw->data_bytes <= max_size);
-  return rw->data + max_size - rw->data_bytes;
+  return rw->data;
 }
 
 #define vnet_rewrite_get_data(rw) \
   vnet_rewrite_get_data_internal (&((rw).rewrite_header), sizeof ((rw).rewrite_data))
 
 always_inline void
-vnet_rewrite_copy_one (vnet_rewrite_data_t * p0, vnet_rewrite_data_t * rw0,
-		       int i)
-{
-  p0[-i] = rw0[-i];
-}
-
-void vnet_rewrite_copy_slow_path (vnet_rewrite_data_t * p0,
-				  vnet_rewrite_data_t * rw0,
-				  word n_left, uword most_likely_size);
-
-/* *INDENT-OFF* */
-typedef CLIB_PACKED (struct {
-  u64 a;
-  u32 b;
-  u16 c;
-}) eh_copy_t;
-/* *INDENT-ON* */
-
-always_inline void
 _vnet_rewrite_one_header (vnet_rewrite_header_t * h0,
 			  void *packet0, int max_size, int most_likely_size)
 {
-  vnet_rewrite_data_t *p0 = packet0;
-  vnet_rewrite_data_t *rw0 = (vnet_rewrite_data_t *) (h0->data + max_size);
-  word n_left0;
-
   /* 0xfefe => poisoned adjacency => crash */
   ASSERT (h0->data_bytes != 0xfefe);
-
-  if (PREDICT_TRUE (h0->data_bytes == sizeof (eh_copy_t)))
+  if (PREDICT_TRUE (most_likely_size == h0->data_bytes))
     {
-      eh_copy_t *s, *d;
-      s = (eh_copy_t *) (h0->data + max_size - sizeof (eh_copy_t));
-      d = (eh_copy_t *) (((u8 *) packet0) - sizeof (eh_copy_t));
-      clib_memcpy (d, s, sizeof (eh_copy_t));
-      return;
+      clib_memcpy_fast ((u8 *) packet0 - most_likely_size,
+			h0->data, most_likely_size);
     }
-  /*
-   * Stop now if the data_bytes field is zero, to avoid the cache
-   * miss consequences of spraying [functionally harmless] junk into
-   * un-prefetched rewrite space.
-   */
-  if (PREDICT_FALSE (h0->data_bytes == 0))
-    return;
-
-#define _(i)								\
-  do {									\
-    if (most_likely_size > ((i)-1)*sizeof (vnet_rewrite_data_t))	\
-      vnet_rewrite_copy_one (p0, rw0, (i));				\
-  } while (0)
-
-  _(4);
-  _(3);
-  _(2);
-  _(1);
-
-#undef _
-
-  n_left0 = (int)
-    (((int) h0->data_bytes - most_likely_size) + (sizeof (rw0[0]) - 1))
-    / (int) sizeof (rw0[0]);
-  if (PREDICT_FALSE (n_left0 > 0))
-    vnet_rewrite_copy_slow_path (p0, rw0, n_left0, most_likely_size);
+  else
+    {
+      clib_memcpy_fast ((u8 *) packet0 - h0->data_bytes,
+			h0->data, h0->data_bytes);
+    }
 }
 
 always_inline void
@@ -223,68 +175,24 @@
 			   void *packet0,
 			   void *packet1, int max_size, int most_likely_size)
 {
-  vnet_rewrite_data_t *p0 = packet0;
-  vnet_rewrite_data_t *p1 = packet1;
-  vnet_rewrite_data_t *rw0 = (vnet_rewrite_data_t *) (h0->data + max_size);
-  vnet_rewrite_data_t *rw1 = (vnet_rewrite_data_t *) (h1->data + max_size);
-  word n_left0, n_left1;
-  int slow_path;
-
   /* 0xfefe => poisoned adjacency => crash */
   ASSERT (h0->data_bytes != 0xfefe);
   ASSERT (h1->data_bytes != 0xfefe);
-
-  /* Arithmetic calculation: bytes0 == bytes1 == 14 */
-  slow_path = h0->data_bytes ^ h1->data_bytes;
-  slow_path += h0->data_bytes ^ sizeof (eh_copy_t);
-
-  if (PREDICT_TRUE (slow_path == 0))
+  if (PREDICT_TRUE
+      (most_likely_size == h0->data_bytes
+       && most_likely_size == h1->data_bytes))
     {
-      eh_copy_t *s0, *d0, *s1, *d1;
-      s0 = (eh_copy_t *) (h0->data + max_size - sizeof (eh_copy_t));
-      d0 = (eh_copy_t *) (((u8 *) packet0) - sizeof (eh_copy_t));
-      clib_memcpy (d0, s0, sizeof (eh_copy_t));
-      s1 = (eh_copy_t *) (h1->data + max_size - sizeof (eh_copy_t));
-      d1 = (eh_copy_t *) (((u8 *) packet1) - sizeof (eh_copy_t));
-      clib_memcpy (d1, s1, sizeof (eh_copy_t));
-      return;
+      clib_memcpy_fast ((u8 *) packet0 - most_likely_size,
+			h0->data, most_likely_size);
+      clib_memcpy_fast ((u8 *) packet1 - most_likely_size,
+			h1->data, most_likely_size);
     }
-
-  /*
-   * Stop now if both rewrite data_bytes fields are zero, to avoid the cache
-   * miss consequences of spraying [functionally harmless] junk into
-   * un-prefetched rewrite space.
-   */
-  if (PREDICT_FALSE (h0->data_bytes + h1->data_bytes == 0))
-    return;
-
-#define _(i)								\
-  do {									\
-    if (most_likely_size > ((i)-1)*sizeof (vnet_rewrite_data_t))	\
-      {									\
-	vnet_rewrite_copy_one (p0, rw0, (i));				\
-	vnet_rewrite_copy_one (p1, rw1, (i));				\
-      }									\
-  } while (0)
-
-  _(4);
-  _(3);
-  _(2);
-  _(1);
-
-#undef _
-
-  n_left0 = (int)
-    (((int) h0->data_bytes - most_likely_size) + (sizeof (rw0[0]) - 1))
-    / (int) sizeof (rw0[0]);
-  n_left1 = (int)
-    (((int) h1->data_bytes - most_likely_size) + (sizeof (rw1[0]) - 1))
-    / (int) sizeof (rw1[0]);
-
-  if (PREDICT_FALSE (n_left0 > 0 || n_left1 > 0))
+  else
     {
-      vnet_rewrite_copy_slow_path (p0, rw0, n_left0, most_likely_size);
-      vnet_rewrite_copy_slow_path (p1, rw1, n_left1, most_likely_size);
+      clib_memcpy_fast ((u8 *) packet0 - h0->data_bytes,
+			h0->data, h0->data_bytes);
+      clib_memcpy_fast ((u8 *) packet1 - h1->data_bytes,
+			h1->data, h1->data_bytes);
     }
 }
 
diff --git a/src/vnet/vxlan-gbp/encap.c b/src/vnet/vxlan-gbp/encap.c
index 2fe3fa8..b687cbf 100644
--- a/src/vnet/vxlan-gbp/encap.c
+++ b/src/vnet/vxlan-gbp/encap.c
@@ -97,7 +97,6 @@
 
   u8 const underlay_hdr_len = is_ip4 ?
     sizeof (ip4_vxlan_gbp_header_t) : sizeof (ip6_vxlan_gbp_header_t);
-  u8 const rw_hdr_offset = sizeof t0->rewrite_data - underlay_hdr_len;
   u16 const l3_len = is_ip4 ? sizeof (ip4_header_t) : sizeof (ip6_header_t);
   u32 const csum_flags = is_ip4 ?
     VNET_BUFFER_F_OFFLOAD_IP_CKSUM | VNET_BUFFER_F_IS_IP4 |
@@ -176,6 +175,9 @@
 
 	  ASSERT (t0->rewrite_header.data_bytes == underlay_hdr_len);
 	  ASSERT (t1->rewrite_header.data_bytes == underlay_hdr_len);
+	  vnet_rewrite_two_headers (*t0, *t1, vlib_buffer_get_current (b0),
+				    vlib_buffer_get_current (b1),
+				    underlay_hdr_len);
 
 	  vlib_buffer_advance (b0, -underlay_hdr_len);
 	  vlib_buffer_advance (b1, -underlay_hdr_len);
@@ -188,16 +190,6 @@
 	  void *underlay0 = vlib_buffer_get_current (b0);
 	  void *underlay1 = vlib_buffer_get_current (b1);
 
-	  /* vnet_rewrite_two_header writes only in (uword) 8 bytes chunks
-	   * and discards the first 4 bytes of the (36 bytes ip4 underlay)  rewrite
-	   * use memcpy as a workaround */
-	  clib_memcpy_fast (underlay0,
-			    t0->rewrite_header.data + rw_hdr_offset,
-			    underlay_hdr_len);
-	  clib_memcpy_fast (underlay1,
-			    t1->rewrite_header.data + rw_hdr_offset,
-			    underlay_hdr_len);
-
 	  ip4_header_t *ip4_0, *ip4_1;
 	  qos_bits_t ip4_0_tos = 0, ip4_1_tos = 0;
 	  ip6_header_t *ip6_0, *ip6_1;
@@ -370,17 +362,12 @@
 	  vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpoi_idx0;
 
 	  ASSERT (t0->rewrite_header.data_bytes == underlay_hdr_len);
+	  vnet_rewrite_one_header (*t0, vlib_buffer_get_current (b0),
+				   underlay_hdr_len);
 
 	  vlib_buffer_advance (b0, -underlay_hdr_len);
 	  void *underlay0 = vlib_buffer_get_current (b0);
 
-	  /* vnet_rewrite_one_header writes only in (uword) 8 bytes chunks
-	   * and discards the first 4 bytes of the (36 bytes ip4 underlay)  rewrite
-	   * use memcpy as a workaround */
-	  clib_memcpy_fast (underlay0,
-			    t0->rewrite_header.data + rw_hdr_offset,
-			    underlay_hdr_len);
-
 	  u32 len0 = vlib_buffer_length_in_chain (vm, b0);
 	  u16 payload_l0 = clib_host_to_net_u16 (len0 - l3_len);
 
diff --git a/src/vnet/vxlan/encap.c b/src/vnet/vxlan/encap.c
index fe2374c..b797624 100644
--- a/src/vnet/vxlan/encap.c
+++ b/src/vnet/vxlan/encap.c
@@ -92,7 +92,6 @@
 
   u8 const underlay_hdr_len = is_ip4 ?
     sizeof(ip4_vxlan_header_t) : sizeof(ip6_vxlan_header_t);
-  u8 const rw_hdr_offset = sizeof t0->rewrite_data - underlay_hdr_len;
   u16 const l3_len = is_ip4 ? sizeof(ip4_header_t) : sizeof(ip6_header_t);
   u32 const csum_flags = is_ip4 ?
     VNET_BUFFER_F_OFFLOAD_IP_CKSUM | VNET_BUFFER_F_IS_IP4 |
@@ -173,6 +172,7 @@
 
           ASSERT(t0->rewrite_header.data_bytes == underlay_hdr_len);
           ASSERT(t1->rewrite_header.data_bytes == underlay_hdr_len);
+          vnet_rewrite_two_headers(*t0, *t1, vlib_buffer_get_current(b0), vlib_buffer_get_current(b1), underlay_hdr_len);
 
           vlib_buffer_advance (b0, -underlay_hdr_len);
           vlib_buffer_advance (b1, -underlay_hdr_len);
@@ -185,12 +185,6 @@
           void * underlay0 = vlib_buffer_get_current(b0);
           void * underlay1 = vlib_buffer_get_current(b1);
 
-	  /* vnet_rewrite_two_header writes only in (uword) 8 bytes chunks
-           * and discards the first 4 bytes of the (36 bytes ip4 underlay)  rewrite
-           * use memcpy as a workaround */
-          clib_memcpy_fast(underlay0, t0->rewrite_header.data + rw_hdr_offset, underlay_hdr_len);
-          clib_memcpy_fast(underlay1, t1->rewrite_header.data + rw_hdr_offset, underlay_hdr_len);
-
           ip4_header_t * ip4_0, * ip4_1;
 	  qos_bits_t ip4_0_tos = 0, ip4_1_tos = 0;
           ip6_header_t * ip6_0, * ip6_1;
@@ -354,15 +348,11 @@
 	  vnet_buffer(b0)->ip.adj_index[VLIB_TX] = dpoi_idx0;
 
           ASSERT(t0->rewrite_header.data_bytes == underlay_hdr_len);
+          vnet_rewrite_one_header(*t0, vlib_buffer_get_current(b0), underlay_hdr_len);
 
           vlib_buffer_advance (b0, -underlay_hdr_len);
           void * underlay0 = vlib_buffer_get_current(b0);
 
-	  /* vnet_rewrite_one_header writes only in (uword) 8 bytes chunks
-           * and discards the first 4 bytes of the (36 bytes ip4 underlay)  rewrite
-           * use memcpy as a workaround */
-          clib_memcpy_fast(underlay0, t0->rewrite_header.data + rw_hdr_offset, underlay_hdr_len);
-
  	  u32 len0 = vlib_buffer_length_in_chain (vm, b0);
           u16 payload_l0 = clib_host_to_net_u16 (len0 - l3_len);