IP fragmentation to handle buffer chains.

Change-Id: Iff557f566ebc9ab170d75da1233997d83b8c8a66
Signed-off-by: Ole Troan <ot@cisco.com>
diff --git a/src/vnet/ip/ip_frag.c b/src/vnet/ip/ip_frag.c
index 8d495af..667a92b 100644
--- a/src/vnet/ip/ip_frag.c
+++ b/src/vnet/ip/ip_frag.c
@@ -46,33 +46,41 @@
 
 static u32 running_fragment_id;
 
+/*
+ * Limitation: Does follow buffer chains in the packet to fragment,
+ * but does not generate buffer chains. I.e. a fragment is always
+ * contained within a single buffer and limited to the max buffer
+ * size.
+ */
 void
-ip4_frag_do_fragment (vlib_main_t * vm, u32 pi, u32 ** buffer,
+ip4_frag_do_fragment (vlib_main_t * vm, u32 from_bi, u32 ** buffer,
 		      ip_frag_error_t * error)
 {
-  vlib_buffer_t *p;
+  vlib_buffer_t *from_b;
   ip4_header_t *ip4;
-  u16 mtu, ptr, len, max, rem, offset, ip_frag_id, ip_frag_offset;
-  u8 *packet, more;
+  u16 mtu, len, max, rem, offset, ip_frag_id, ip_frag_offset;
+  u8 *org_from_packet, more;
 
-  vec_add1 (*buffer, pi);
-  p = vlib_get_buffer (vm, pi);
-  offset = vnet_buffer (p)->ip_frag.header_offset;
-  mtu = vnet_buffer (p)->ip_frag.mtu;
-  packet = (u8 *) vlib_buffer_get_current (p);
-  ip4 = (ip4_header_t *) (packet + offset);
+  from_b = vlib_get_buffer (vm, from_bi);
+  offset = vnet_buffer (from_b)->ip_frag.header_offset;
+  mtu = vnet_buffer (from_b)->ip_frag.mtu;
+  org_from_packet = vlib_buffer_get_current (from_b);
+  ip4 = (ip4_header_t *) (org_from_packet + offset);	/* add bytes, then cast */
 
-  rem = clib_net_to_host_u16 (ip4->length) - sizeof (*ip4);
-  ptr = 0;
-  max = (mtu - sizeof (*ip4) - vnet_buffer (p)->ip_frag.header_offset) & ~0x7;
+  rem = clib_net_to_host_u16 (ip4->length) - sizeof (ip4_header_t);
+  max =
+    (mtu - sizeof (ip4_header_t) -
+     vnet_buffer (from_b)->ip_frag.header_offset) & ~0x7;
 
-  if (rem > (p->current_length - offset - sizeof (*ip4)))
+  if (rem >
+      (vlib_buffer_length_in_chain (vm, from_b) - offset -
+       sizeof (ip4_header_t)))
     {
       *error = IP_FRAG_ERROR_MALFORMED;
       return;
     }
 
-  if (mtu < sizeof (*ip4))
+  if (mtu < sizeof (ip4_header_t))
     {
       *error = IP_FRAG_ERROR_CANT_FRAGMENT_HEADER;
       return;
@@ -85,12 +93,6 @@
       return;
     }
 
-  if (p->flags & VLIB_BUFFER_NEXT_PRESENT)
-    {
-      *error = IP_FRAG_ERROR_MALFORMED;
-      return;
-    }
-
   if (ip4_is_fragment (ip4))
     {
       ip_frag_id = ip4->fragment_id;
@@ -106,84 +108,109 @@
       more = 0;
     }
 
-  //Do the actual fragmentation
+  u8 *from_data = (void *) (ip4 + 1);
+  vlib_buffer_t *org_from_b = from_b;
+  u16 ptr = 0, fo = 0;
+  u16 left_in_from_buffer =
+    from_b->current_length - offset - sizeof (ip4_header_t);
+
+  /* Do the actual fragmentation */
   while (rem)
     {
-      u32 bi;
-      vlib_buffer_t *b;
-      ip4_header_t *fip4;
+      u32 to_bi;
+      vlib_buffer_t *to_b;
+      ip4_header_t *to_ip4;
+      u8 *to_data;
 
-      len =
-	(rem >
-	 (mtu - sizeof (*ip4) -
-	  vnet_buffer (p)->ip_frag.header_offset)) ? max : rem;
-
-      if (ptr == 0)
+      len = (rem > (mtu - sizeof (ip4_header_t) - offset) ? max : rem);
+      if (len != rem)		/* Last fragment does not need to be divisible by 8 */
+	len &= ~0x7;
+      if (!vlib_buffer_alloc (vm, &to_bi, 1))
 	{
-	  bi = pi;
-	  b = p;
-	  fip4 = (ip4_header_t *) (vlib_buffer_get_current (b) + offset);
+	  *error = IP_FRAG_ERROR_MEMORY;
+	  /* XXX: fragment buffers already added to *buffer leak here */
+	  return;
+	}
+      vec_add1 (*buffer, to_bi);
+      to_b = vlib_get_buffer (vm, to_bi);
+      vnet_buffer (to_b)->sw_if_index[VLIB_RX] =
+	vnet_buffer (org_from_b)->sw_if_index[VLIB_RX];
+      vnet_buffer (to_b)->sw_if_index[VLIB_TX] =
+	vnet_buffer (org_from_b)->sw_if_index[VLIB_TX];
+      /* Copy adj_index in case DPO based node is sending for the
+       * fragmentation, the packet would be sent back to the proper
+       * DPO next node and Index
+       */
+      vnet_buffer (to_b)->ip.adj_index[VLIB_RX] =
+	vnet_buffer (org_from_b)->ip.adj_index[VLIB_RX];
+      vnet_buffer (to_b)->ip.adj_index[VLIB_TX] =
+	vnet_buffer (org_from_b)->ip.adj_index[VLIB_TX];
+
+      /* Copy offset and ip4 header */
+      clib_memcpy (to_b->data, org_from_packet,
+		   offset + sizeof (ip4_header_t));
+      to_ip4 = vlib_buffer_get_current (to_b) + offset;
+      to_data = (void *) (to_ip4 + 1);
+
+      /* Spin through buffer chain copying data */
+      /* XXX: a fragment spanning more than two source buffers is not handled */
+      if (len > left_in_from_buffer)
+	{
+	  clib_memcpy (to_data, from_data + ptr, left_in_from_buffer);
+
+	  /* Move buffer */
+	  if (!(from_b->flags & VLIB_BUFFER_NEXT_PRESENT))
+	    {
+	      *error = IP_FRAG_ERROR_MALFORMED;
+	      return;
+	    }
+	  from_b = vlib_get_buffer (vm, from_b->next_buffer);
+	  from_data = (u8 *) vlib_buffer_get_current (from_b);
+	  clib_memcpy (to_data + left_in_from_buffer, from_data,
+		       len - left_in_from_buffer);
+	  ptr = len - left_in_from_buffer;
+	  left_in_from_buffer =
+	    from_b->current_length - (len - left_in_from_buffer);
 	}
       else
 	{
-	  if (!vlib_buffer_alloc (vm, &bi, 1))
-	    {
-	      *error = IP_FRAG_ERROR_MEMORY;
-	      return;
-	    }
-	  vec_add1 (*buffer, bi);
-	  b = vlib_get_buffer (vm, bi);
-	  vnet_buffer (b)->sw_if_index[VLIB_RX] =
-	    vnet_buffer (p)->sw_if_index[VLIB_RX];
-	  vnet_buffer (b)->sw_if_index[VLIB_TX] =
-	    vnet_buffer (p)->sw_if_index[VLIB_TX];
-	  /* Copy Adj_index in case DPO based node is sending for the fragmentation,
-	     the packet would be sent back to the proper DPO next node and Index */
-	  vnet_buffer (b)->ip.adj_index[VLIB_RX] =
-	    vnet_buffer (p)->ip.adj_index[VLIB_RX];
-	  vnet_buffer (b)->ip.adj_index[VLIB_TX] =
-	    vnet_buffer (p)->ip.adj_index[VLIB_TX];
-	  fip4 = (ip4_header_t *) (vlib_buffer_get_current (b) + offset);
-
-	  //Copy offset and ip4 header
-	  clib_memcpy (b->data, packet, offset + sizeof (*ip4));
-	  //Copy data
-	  clib_memcpy (((u8 *) (fip4)) + sizeof (*fip4),
-		       packet + offset + sizeof (*fip4) + ptr, len);
+	  clib_memcpy (to_data, from_data + ptr, len);
+	  left_in_from_buffer -= len;
+	  ptr += len;
 	}
-      b->current_length = offset + len + sizeof (*fip4);
+      to_b->current_length = offset + len + sizeof (ip4_header_t);
 
-      fip4->fragment_id = ip_frag_id;
-      fip4->flags_and_fragment_offset =
-	clib_host_to_net_u16 ((ptr >> 3) + ip_frag_offset);
-      fip4->flags_and_fragment_offset |=
+      to_ip4->fragment_id = ip_frag_id;
+      to_ip4->flags_and_fragment_offset =
+	clib_host_to_net_u16 ((fo >> 3) + ip_frag_offset);
+      to_ip4->flags_and_fragment_offset |=
 	clib_host_to_net_u16 (((len != rem) || more) << 13);
-      // ((len0 != rem0) || more0) << 13 is optimization for
-      // ((len0 != rem0) || more0) ? IP4_HEADER_FLAG_MORE_FRAGMENTS : 0
-      fip4->length = clib_host_to_net_u16 (len + sizeof (*fip4));
-      fip4->checksum = ip4_header_checksum (fip4);
+      to_ip4->length = clib_host_to_net_u16 (len + sizeof (ip4_header_t));
+      to_ip4->checksum = ip4_header_checksum (to_ip4);
 
-      if (vnet_buffer (p)->ip_frag.flags & IP_FRAG_FLAG_IP4_HEADER)
+      if (vnet_buffer (org_from_b)->ip_frag.flags & IP_FRAG_FLAG_IP4_HEADER)
 	{
-	  //Encapsulating ipv4 header
+	  /* Encapsulating ipv4 header */
 	  ip4_header_t *encap_header4 =
-	    (ip4_header_t *) vlib_buffer_get_current (b);
-	  encap_header4->length = clib_host_to_net_u16 (b->current_length);
+	    (ip4_header_t *) vlib_buffer_get_current (to_b);
+	  encap_header4->length = clib_host_to_net_u16 (to_b->current_length);
 	  encap_header4->checksum = ip4_header_checksum (encap_header4);
 	}
-      else if (vnet_buffer (p)->ip_frag.flags & IP_FRAG_FLAG_IP6_HEADER)
+      else if (vnet_buffer (org_from_b)->
+	       ip_frag.flags & IP_FRAG_FLAG_IP6_HEADER)
 	{
-	  //Encapsulating ipv6 header
+	  /* Encapsulating ipv6 header */
 	  ip6_header_t *encap_header6 =
-	    (ip6_header_t *) vlib_buffer_get_current (b);
+	    (ip6_header_t *) vlib_buffer_get_current (to_b);
 	  encap_header6->payload_length =
-	    clib_host_to_net_u16 (b->current_length -
+	    clib_host_to_net_u16 (to_b->current_length -
 				  sizeof (*encap_header6));
 	}
-
       rem -= len;
-      ptr += len;
+      fo += len;
     }
+  /* Free original packet chain */
+  vlib_buffer_free_one (vm, from_bi);
 }
 
 void
diff --git a/src/vnet/ipip/ipip.h b/src/vnet/ipip/ipip.h
index 6afb188..28833df 100644
--- a/src/vnet/ipip/ipip.h
+++ b/src/vnet/ipip/ipip.h
@@ -26,11 +26,12 @@
 
 extern vnet_hw_interface_class_t ipip_hw_interface_class;
 
-#define foreach_ipip_error			\
-  /* Must be first. */				\
-  _(DECAP_PKTS, "packets decapsulated")		\
-  _(BAD_PROTOCOL, "bad protocol")		\
-  _(NO_TUNNEL, "no tunnel")
+#define foreach_ipip_error				\
+  /* Must be first. */					\
+  _(DECAP_PKTS, "packets decapsulated")			\
+  _(BAD_PROTOCOL, "bad protocol")			\
+  _(NO_TUNNEL, "no tunnel")				\
+  _(FRAGMENTED_PACKET, "fragmented outer packet")
 
 typedef enum
 {
diff --git a/src/vnet/ipip/node.c b/src/vnet/ipip/node.c
index d55b91a..60d6223 100644
--- a/src/vnet/ipip/node.c
+++ b/src/vnet/ipip/node.c
@@ -108,6 +108,14 @@
 	  else
 	    {
 	      ip40 = vlib_buffer_get_current (b0);
+	      /* Drop fragmented outer packets; ip4_is_fragment also
+	       * catches the last fragment (MF clear, offset set) */
+	      if (ip4_is_fragment (ip40))
+		{
+		  next0 = IPIP_INPUT_NEXT_DROP;
+		  b0->error = node->errors[IPIP_ERROR_FRAGMENTED_PACKET];
+		  goto drop;
+		}
 	      vlib_buffer_advance (b0, sizeof (*ip40));
 	      ip_set (&src0, &ip40->src_address, true);
 	      ip_set (&dst0, &ip40->dst_address, true);