ip: Fragmentation fixes

Type: fix

If the packet is about to be fragmented, then don't call any of the
actions that expect the rewrite to have been written.
1) don't double count packets through the adjacency (original & fragments)
2) don't double decrement the TTL for fragments
3) return to ip4-midchain post ip-frag if that's where we started.
4) only run midchain/mcast fixups if not fragmenting (if no errors)

Change-Id: Ib2866787a42713ee5871b87b597d8f74b901044b
Signed-off-by: Neale Ranns <nranns@cisco.com>
diff --git a/src/vnet/interface_output.h b/src/vnet/interface_output.h
index 58f7f61..f1fa4d8 100644
--- a/src/vnet/interface_output.h
+++ b/src/vnet/interface_output.h
@@ -62,7 +62,6 @@
 
   if (is_ip4)
     {
-      ip4 = (ip4_header_t *) (b->data + vnet_buffer (b)->l3_hdr_offset);
       if (b->flags & VNET_BUFFER_F_OFFLOAD_IP_CKSUM)
 	ip4->checksum = ip4_header_checksum (ip4);
       if (b->flags & VNET_BUFFER_F_OFFLOAD_TCP_CKSUM)
@@ -71,7 +70,10 @@
 	  th->checksum = ip4_tcp_udp_compute_checksum (vm, b, ip4);
 	}
       if (b->flags & VNET_BUFFER_F_OFFLOAD_UDP_CKSUM)
-	uh->checksum = ip4_tcp_udp_compute_checksum (vm, b, ip4);
+	{
+	  uh->checksum = 0;
+	  uh->checksum = ip4_tcp_udp_compute_checksum (vm, b, ip4);
+	}
     }
   if (is_ip6)
     {
diff --git a/src/vnet/ip/ip4_forward.c b/src/vnet/ip/ip4_forward.c
index 40c396c..1550b31 100644
--- a/src/vnet/ip/ip4_forward.c
+++ b/src/vnet/ip/ip4_forward.c
@@ -1203,7 +1203,7 @@
   s = format (s, "\n%U%U",
 	      format_white_space, indent,
 	      format_ip_adjacency_packet_data,
-	      t->dpo_index, t->packet_data, sizeof (t->packet_data));
+	      t->packet_data, sizeof (t->packet_data));
   return s;
 }
 
@@ -2293,7 +2293,8 @@
 
 always_inline void
 ip4_mtu_check (vlib_buffer_t * b, u16 packet_len,
-	       u16 adj_packet_bytes, bool df, u16 * next, u32 * error)
+	       u16 adj_packet_bytes, bool df, u16 * next, u32 * error,
+	       u8 is_midchain)
 {
   if (packet_len > adj_packet_bytes)
     {
@@ -2310,12 +2311,39 @@
 	{
 	  /* IP fragmentation */
 	  ip_frag_set_vnet_buffer (b, adj_packet_bytes,
-				   IP4_FRAG_NEXT_IP4_REWRITE, 0);
+				   (is_midchain ?
+				    IP4_FRAG_NEXT_IP4_REWRITE_MIDCHAIN :
+				    IP4_FRAG_NEXT_IP4_REWRITE), 0);
 	  *next = IP4_REWRITE_NEXT_FRAGMENT;
 	}
     }
 }
 
+/* Undo a TTL decrement: ttl += 1 and patch the header checksum in place.
+   Locally-originated buffers are skipped (flag cleared, header untouched).
+   Works either endian, so no need for byte swap. */
+static_always_inline void
+ip4_ttl_inc (vlib_buffer_t * b, ip4_header_t * ip)
+{
+  u32 checksum;
+
+  if (PREDICT_FALSE (b->flags & VNET_BUFFER_F_LOCALLY_ORIGINATED))
+    {
+      b->flags &= ~VNET_BUFFER_F_LOCALLY_ORIGINATED;
+      return;
+    }
+
+  /* TTL up => header sum up => checksum down.  A result above 0xffff
+     means the subtract borrowed; one's complement takes the borrow off. */
+  checksum = ip->checksum - clib_host_to_net_u16 (0x0100);
+  checksum -= checksum > 0xffff;
+
+  ip->checksum = checksum;
+  ip->ttl += 1;
+
+  ASSERT (ip->checksum == ip4_header_checksum (ip));
+}
+
 /* Decrement TTL & update checksum.
    Works either endian, so no need for byte swap. */
 static_always_inline void
@@ -2458,12 +2486,12 @@
 		     adj0[0].rewrite_header.max_l3_packet_bytes,
 		     ip0->flags_and_fragment_offset &
 		     clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT),
-		     next + 0, &error0);
+		     next + 0, &error0, is_midchain);
       ip4_mtu_check (b[1], ip1_len,
 		     adj1[0].rewrite_header.max_l3_packet_bytes,
 		     ip1->flags_and_fragment_offset &
 		     clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT),
-		     next + 1, &error1);
+		     next + 1, &error1, is_midchain);
 
       if (is_mcast)
 	{
@@ -2481,6 +2509,7 @@
 	{
 	  u32 next_index = adj0[0].rewrite_header.next_index;
 	  vlib_buffer_advance (b[0], -(word) rw_len0);
+
 	  tx_sw_if_index0 = adj0[0].rewrite_header.sw_if_index;
 	  vnet_buffer (b[0])->sw_if_index[VLIB_TX] = tx_sw_if_index0;
 
@@ -2489,10 +2518,14 @@
 	    vnet_feature_arc_start (lm->output_feature_arc_index,
 				    tx_sw_if_index0, &next_index, b[0]);
 	  next[0] = next_index;
+	  if (is_midchain)
+	    calc_checksums (vm, b[0]);
 	}
       else
 	{
 	  b[0]->error = error_node->errors[error0];
+	  if (error0 == IP4_ERROR_MTU_EXCEEDED)
+	    ip4_ttl_inc (b[0], ip0);
 	}
       if (PREDICT_TRUE (error1 == IP4_ERROR_NONE))
 	{
@@ -2507,57 +2540,58 @@
 	    vnet_feature_arc_start (lm->output_feature_arc_index,
 				    tx_sw_if_index1, &next_index, b[1]);
 	  next[1] = next_index;
+	  if (is_midchain)
+	    calc_checksums (vm, b[1]);
 	}
       else
 	{
 	  b[1]->error = error_node->errors[error1];
+	  if (error1 == IP4_ERROR_MTU_EXCEEDED)
+	    ip4_ttl_inc (b[1], ip1);
 	}
-      if (is_midchain)
-	{
-	  calc_checksums (vm, b[0]);
-	  calc_checksums (vm, b[1]);
-	}
+
       /* Guess we are only writing on simple Ethernet header. */
       vnet_rewrite_two_headers (adj0[0], adj1[0],
 				ip0, ip1, sizeof (ethernet_header_t));
 
-      /*
-       * Bump the per-adjacency counters
-       */
       if (do_counters)
 	{
-	  vlib_increment_combined_counter
-	    (&adjacency_counters,
-	     thread_index,
-	     adj_index0, 1, vlib_buffer_length_in_chain (vm, b[0]) + rw_len0);
+	  if (error0 == IP4_ERROR_NONE)
+	    vlib_increment_combined_counter
+	      (&adjacency_counters,
+	       thread_index,
+	       adj_index0, 1,
+	       vlib_buffer_length_in_chain (vm, b[0]) + rw_len0);
 
-	  vlib_increment_combined_counter
-	    (&adjacency_counters,
-	     thread_index,
-	     adj_index1, 1, vlib_buffer_length_in_chain (vm, b[1]) + rw_len1);
+	  if (error1 == IP4_ERROR_NONE)
+	    vlib_increment_combined_counter
+	      (&adjacency_counters,
+	       thread_index,
+	       adj_index1, 1,
+	       vlib_buffer_length_in_chain (vm, b[1]) + rw_len1);
 	}
 
       if (is_midchain)
 	{
-	  if (adj0->sub_type.midchain.fixup_func)
+	  if (error0 == IP4_ERROR_NONE && adj0->sub_type.midchain.fixup_func)
 	    adj0->sub_type.midchain.fixup_func
 	      (vm, adj0, b[0], adj0->sub_type.midchain.fixup_data);
-	  if (adj1->sub_type.midchain.fixup_func)
+	  if (error1 == IP4_ERROR_NONE && adj1->sub_type.midchain.fixup_func)
 	    adj1->sub_type.midchain.fixup_func
 	      (vm, adj1, b[1], adj1->sub_type.midchain.fixup_data);
 	}
 
       if (is_mcast)
 	{
-	  /*
-	   * copy bytes from the IP address into the MAC rewrite
-	   */
-	  vnet_ip_mcast_fixup_header (IP4_MCAST_ADDR_MASK,
-				      adj0->rewrite_header.dst_mcast_offset,
-				      &ip0->dst_address.as_u32, (u8 *) ip0);
-	  vnet_ip_mcast_fixup_header (IP4_MCAST_ADDR_MASK,
-				      adj1->rewrite_header.dst_mcast_offset,
-				      &ip1->dst_address.as_u32, (u8 *) ip1);
+	  /* copy bytes from the IP address into the MAC rewrite */
+	  if (error0 == IP4_ERROR_NONE)
+	    vnet_ip_mcast_fixup_header (IP4_MCAST_ADDR_MASK,
+					adj0->rewrite_header.dst_mcast_offset,
+					&ip0->dst_address.as_u32, (u8 *) ip0);
+	  if (error1 == IP4_ERROR_NONE)
+	    vnet_ip_mcast_fixup_header (IP4_MCAST_ADDR_MASK,
+					adj1->rewrite_header.dst_mcast_offset,
+					&ip1->dst_address.as_u32, (u8 *) ip1);
 	}
 
       next += 2;
@@ -2626,7 +2660,7 @@
 		     adj0[0].rewrite_header.max_l3_packet_bytes,
 		     ip0->flags_and_fragment_offset &
 		     clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT),
-		     next + 0, &error0);
+		     next + 0, &error0, is_midchain);
 
       if (is_mcast)
 	{
@@ -2649,44 +2683,38 @@
 	    vnet_feature_arc_start (lm->output_feature_arc_index,
 				    tx_sw_if_index0, &next_index, b[0]);
 	  next[0] = next_index;
+
+	  if (is_midchain)
+	    calc_checksums (vm, b[0]);
+
+	  /* Guess we are only writing on simple Ethernet header. */
+	  vnet_rewrite_one_header (adj0[0], ip0, sizeof (ethernet_header_t));
+
+	  /*
+	   * Bump the per-adjacency counters
+	   */
+	  if (do_counters)
+	    vlib_increment_combined_counter
+	      (&adjacency_counters,
+	       thread_index,
+	       adj_index0, 1, vlib_buffer_length_in_chain (vm,
+							   b[0]) + rw_len0);
+
+	  if (is_midchain && adj0->sub_type.midchain.fixup_func)
+	    adj0->sub_type.midchain.fixup_func
+	      (vm, adj0, b[0], adj0->sub_type.midchain.fixup_data);
+
+	  if (is_mcast)
+	    /* copy bytes from the IP address into the MAC rewrite */
+	    vnet_ip_mcast_fixup_header (IP4_MCAST_ADDR_MASK,
+					adj0->rewrite_header.dst_mcast_offset,
+					&ip0->dst_address.as_u32, (u8 *) ip0);
 	}
       else
 	{
 	  b[0]->error = error_node->errors[error0];
-	}
-      if (is_midchain)
-	{
-	  calc_checksums (vm, b[0]);
-	}
-      /* Guess we are only writing on simple Ethernet header. */
-      vnet_rewrite_one_header (adj0[0], ip0, sizeof (ethernet_header_t));
-
-      /*
-       * Bump the per-adjacency counters
-       */
-      if (do_counters)
-	{
-	  vlib_increment_combined_counter
-	    (&adjacency_counters,
-	     thread_index,
-	     adj_index0, 1, vlib_buffer_length_in_chain (vm, b[0]) + rw_len0);
-	}
-
-      if (is_midchain)
-	{
-	  if (adj0->sub_type.midchain.fixup_func)
-	    adj0->sub_type.midchain.fixup_func
-	      (vm, adj0, b[0], adj0->sub_type.midchain.fixup_data);
-	}
-
-      if (is_mcast)
-	{
-	  /*
-	   * copy bytes from the IP address into the MAC rewrite
-	   */
-	  vnet_ip_mcast_fixup_header (IP4_MCAST_ADDR_MASK,
-				      adj0->rewrite_header.dst_mcast_offset,
-				      &ip0->dst_address.as_u32, (u8 *) ip0);
+	  if (error0 == IP4_ERROR_MTU_EXCEEDED)
+	    ip4_ttl_inc (b[0], ip0);
 	}
 
       next += 1;
@@ -2730,7 +2758,7 @@
 		     adj0[0].rewrite_header.max_l3_packet_bytes,
 		     ip0->flags_and_fragment_offset &
 		     clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT),
-		     next + 0, &error0);
+		     next + 0, &error0, is_midchain);
 
       if (is_mcast)
 	{
@@ -2753,39 +2781,36 @@
 	    vnet_feature_arc_start (lm->output_feature_arc_index,
 				    tx_sw_if_index0, &next_index, b[0]);
 	  next[0] = next_index;
+
+	  if (is_midchain)
+	    /* this acts on the packet that is about to be encapped */
+	    calc_checksums (vm, b[0]);
+
+	  /* Guess we are only writing on simple Ethernet header. */
+	  vnet_rewrite_one_header (adj0[0], ip0, sizeof (ethernet_header_t));
+
+	  if (do_counters)
+	    vlib_increment_combined_counter
+	      (&adjacency_counters,
+	       thread_index, adj_index0, 1,
+	       vlib_buffer_length_in_chain (vm, b[0]) + rw_len0);
+
+	  if (is_midchain && adj0->sub_type.midchain.fixup_func)
+	    adj0->sub_type.midchain.fixup_func
+	      (vm, adj0, b[0], adj0->sub_type.midchain.fixup_data);
+
+	  if (is_mcast)
+	    /* copy bytes from the IP address into the MAC rewrite */
+	    vnet_ip_mcast_fixup_header (IP4_MCAST_ADDR_MASK,
+					adj0->rewrite_header.dst_mcast_offset,
+					&ip0->dst_address.as_u32, (u8 *) ip0);
 	}
       else
 	{
 	  b[0]->error = error_node->errors[error0];
-	}
-      if (is_midchain)
-	{
-	  calc_checksums (vm, b[0]);
-	}
-      /* Guess we are only writing on simple Ethernet header. */
-      vnet_rewrite_one_header (adj0[0], ip0, sizeof (ethernet_header_t));
-
-      if (do_counters)
-	vlib_increment_combined_counter
-	  (&adjacency_counters,
-	   thread_index, adj_index0, 1,
-	   vlib_buffer_length_in_chain (vm, b[0]) + rw_len0);
-
-      if (is_midchain)
-	{
-	  if (adj0->sub_type.midchain.fixup_func)
-	    adj0->sub_type.midchain.fixup_func
-	      (vm, adj0, b[0], adj0->sub_type.midchain.fixup_data);
-	}
-
-      if (is_mcast)
-	{
-	  /*
-	   * copy bytes from the IP address into the MAC rewrite
-	   */
-	  vnet_ip_mcast_fixup_header (IP4_MCAST_ADDR_MASK,
-				      adj0->rewrite_header.dst_mcast_offset,
-				      &ip0->dst_address.as_u32, (u8 *) ip0);
+	  /* undo the TTL decrement - we'll be back to do it again */
+	  if (error0 == IP4_ERROR_MTU_EXCEEDED)
+	    ip4_ttl_inc (b[0], ip0);
 	}
 
       next += 1;
@@ -2943,8 +2968,8 @@
 VLIB_REGISTER_NODE (ip4_midchain_node) = {
   .name = "ip4-midchain",
   .vector_size = sizeof (u32),
-  .format_trace = format_ip4_forward_next_trace,
-  .sibling_of =  "ip4-rewrite",
+  .format_trace = format_ip4_rewrite_trace,
+  .sibling_of = "ip4-rewrite",
 };
 /* *INDENT-ON */
 
diff --git a/src/vnet/ip/ip6_forward.c b/src/vnet/ip/ip6_forward.c
index 47fb57a..50de501 100644
--- a/src/vnet/ip/ip6_forward.c
+++ b/src/vnet/ip/ip6_forward.c
@@ -908,7 +908,7 @@
   s = format (s, "\n%U%U",
 	      format_white_space, indent,
 	      format_ip_adjacency_packet_data,
-	      t->adj_index, t->packet_data, sizeof (t->packet_data));
+	      t->packet_data, sizeof (t->packet_data));
   return s;
 }
 
diff --git a/src/vnet/ip/ip_frag.c b/src/vnet/ip/ip_frag.c
index 230722c..54efb63 100644
--- a/src/vnet/ip/ip_frag.c
+++ b/src/vnet/ip/ip_frag.c
@@ -200,6 +200,17 @@
       clib_memcpy_fast (to_b->data, org_from_packet, sizeof (ip4_header_t));
       to_ip4 = vlib_buffer_get_current (to_b);
       to_data = (void *) (to_ip4 + 1);
+      vnet_buffer (to_b)->l3_hdr_offset = to_b->current_data;
+      to_b->flags |= VNET_BUFFER_F_L3_HDR_OFFSET_VALID;
+
+      if (from_b->flags & VNET_BUFFER_F_L4_HDR_OFFSET_VALID)
+	{
+	  vnet_buffer (to_b)->l4_hdr_offset =
+	    (vnet_buffer (to_b)->l3_hdr_offset +
+	     (vnet_buffer (from_b)->l4_hdr_offset -
+	      vnet_buffer (from_b)->l3_hdr_offset));
+	  to_b->flags |= VNET_BUFFER_F_L4_HDR_OFFSET_VALID;
+	}
 
       /* Spin through from buffers filling up the to buffer */
       u16 left_in_to_buffer = len, to_ptr = 0;
@@ -232,6 +243,7 @@
 	}
 
       to_b->current_length = len + sizeof (ip4_header_t);
+      to_b->flags |= VNET_BUFFER_F_IS_IP4;
 
       to_ip4->fragment_id = ip_frag_id;
       to_ip4->flags_and_fragment_offset =
@@ -241,6 +253,9 @@
       to_ip4->length = clib_host_to_net_u16 (len + sizeof (ip4_header_t));
       to_ip4->checksum = ip4_header_checksum (to_ip4);
 
+      /* we've just done the IP checksum .. */
+      to_b->flags &= ~VNET_BUFFER_F_OFFLOAD_IP_CKSUM;
+
       if (vnet_buffer (org_from_b)->ip_frag.flags & IP_FRAG_FLAG_IP4_HEADER)
 	{
 	  /* Encapsulating ipv4 header */
@@ -482,6 +497,19 @@
       to_frag_hdr = (ip6_frag_hdr_t *) (to_ip6 + 1);
       to_data = (void *) (to_frag_hdr + 1);
 
+      vnet_buffer (to_b)->l3_hdr_offset = to_b->current_data;
+      to_b->flags |= VNET_BUFFER_F_L3_HDR_OFFSET_VALID;
+
+      if (from_b->flags & VNET_BUFFER_F_L4_HDR_OFFSET_VALID)
+	{
+	  vnet_buffer (to_b)->l4_hdr_offset =
+	    (vnet_buffer (to_b)->l3_hdr_offset +
+	     (vnet_buffer (from_b)->l4_hdr_offset -
+	      vnet_buffer (from_b)->l3_hdr_offset));
+	  to_b->flags |= VNET_BUFFER_F_L4_HDR_OFFSET_VALID;
+	}
+      to_b->flags |= VNET_BUFFER_F_IS_IP6;
+
       /* Spin through from buffers filling up the to buffer */
       u16 left_in_to_buffer = len, to_ptr = 0;
       while (1)
@@ -551,6 +579,7 @@
   .n_next_nodes = IP4_FRAG_N_NEXT,
   .next_nodes = {
     [IP4_FRAG_NEXT_IP4_REWRITE] = "ip4-rewrite",
+    [IP4_FRAG_NEXT_IP4_REWRITE_MIDCHAIN] = "ip4-midchain",
     [IP4_FRAG_NEXT_IP4_LOOKUP] = "ip4-lookup",
     [IP4_FRAG_NEXT_IP6_LOOKUP] = "ip6-lookup",
     [IP4_FRAG_NEXT_MPLS_OUTPUT] = "mpls-output",
@@ -574,6 +603,7 @@
   .n_next_nodes = IP6_FRAG_N_NEXT,
   .next_nodes = {
     [IP6_FRAG_NEXT_IP6_REWRITE] = "ip6-rewrite",
+    [IP6_FRAG_NEXT_IP6_REWRITE_MIDCHAIN] = "ip6-midchain",
     [IP6_FRAG_NEXT_IP4_LOOKUP] = "ip4-lookup",
     [IP6_FRAG_NEXT_IP6_LOOKUP] = "ip6-lookup",
     [IP6_FRAG_NEXT_MPLS_OUTPUT] = "mpls-output",
diff --git a/src/vnet/ip/ip_frag.h b/src/vnet/ip/ip_frag.h
index b66db41..ce4236b 100644
--- a/src/vnet/ip/ip_frag.h
+++ b/src/vnet/ip/ip_frag.h
@@ -50,6 +50,7 @@
 typedef enum
 {
   IP4_FRAG_NEXT_IP4_REWRITE,
+  IP4_FRAG_NEXT_IP4_REWRITE_MIDCHAIN,
   IP4_FRAG_NEXT_IP4_LOOKUP,
   IP4_FRAG_NEXT_IP6_LOOKUP,
   IP4_FRAG_NEXT_MPLS_OUTPUT,
@@ -63,6 +64,7 @@
   IP6_FRAG_NEXT_IP4_LOOKUP,
   IP6_FRAG_NEXT_IP6_LOOKUP,
   IP6_FRAG_NEXT_IP6_REWRITE,
+  IP6_FRAG_NEXT_IP6_REWRITE_MIDCHAIN,
   IP6_FRAG_NEXT_MPLS_OUTPUT,
   IP6_FRAG_NEXT_DROP,
   IP6_FRAG_N_NEXT
diff --git a/src/vnet/ip/lookup.c b/src/vnet/ip/lookup.c
index 4db7660..c1fbc42 100644
--- a/src/vnet/ip/lookup.c
+++ b/src/vnet/ip/lookup.c
@@ -258,27 +258,10 @@
 u8 *
 format_ip_adjacency_packet_data (u8 * s, va_list * args)
 {
-  u32 adj_index = va_arg (*args, u32);
   u8 *packet_data = va_arg (*args, u8 *);
   u32 n_packet_data_bytes = va_arg (*args, u32);
-  ip_adjacency_t *adj;
 
-  if (!adj_is_valid (adj_index))
-    return format (s, "<invalid adjacency>");
-
-  adj = adj_get (adj_index);
-
-  switch (adj->lookup_next_index)
-    {
-    case IP_LOOKUP_NEXT_REWRITE:
-    case IP_LOOKUP_NEXT_MCAST:
-      s =
-	format (s, "%U", format_hex_bytes, packet_data, n_packet_data_bytes);
-      break;
-
-    default:
-      break;
-    }
+  s = format (s, "%U", format_hex_bytes, packet_data, n_packet_data_bytes);
 
   return s;
 }