Improve fifo allocator performance

- add option to preallocate fifos in a segment
- track active fifos with doubly linked list instead of vector
- update udp redirect test code to read fifo pointers from API call
  instead of digging them up from fifo segment header
- input-node based active-open session generator

Change-Id: I804b81e99d95f8690d17e12660c6645995e28a9a
Signed-off-by: Dave Barach <dave@barachs.net>
Signed-off-by: Florin Coras <fcoras@cisco.com>
Signed-off-by: Dave Barach <dbarach@cisco.com>
diff --git a/src/svm/svm_fifo.h b/src/svm/svm_fifo.h
index 6936916..9cb93ff 100644
--- a/src/svm/svm_fifo.h
+++ b/src/svm/svm_fifo.h
@@ -38,7 +38,7 @@
 
 #define OOO_SEGMENT_INVALID_INDEX ((u32)~0)
 
-typedef struct
+typedef struct _svm_fifo
 {
   volatile u32 cursize;		/**< current fifo size */
   u32 nitems;
@@ -62,7 +62,8 @@
   ooo_segment_t *ooo_segments;	/**< Pool of ooo segments */
   u32 ooos_list_head;		/**< Head of out-of-order linked-list */
   u32 ooos_newest;		/**< Last segment to have been updated */
-
+  struct _svm_fifo *next;	/**< next in freelist/active chain */
+  struct _svm_fifo *prev;	/**< prev in active chain */
     CLIB_CACHE_LINE_ALIGN_MARK (data);
 } svm_fifo_t;
 
diff --git a/src/svm/svm_fifo_segment.c b/src/svm/svm_fifo_segment.c
index 281fae2..eef2168 100644
--- a/src/svm/svm_fifo_segment.c
+++ b/src/svm/svm_fifo_segment.c
@@ -17,6 +17,71 @@
 
 svm_fifo_segment_main_t svm_fifo_segment_main;
 
+/** Preallocate rx/tx fifo pairs and chain them on the segment freelists.
+ * Best effort: bad arguments or a failed allocation leave the lists as is. */
+static void
+preallocate_fifo_pairs (svm_fifo_segment_header_t * fsh,
+			svm_fifo_segment_create_args_t * a)
+{
+  uword rx_fifo_size, tx_fifo_size;
+  svm_fifo_t *f;
+  u8 *rx_fifo_space, *tx_fifo_space;
+  u32 i;
+
+  if (a->rx_fifo_size == 0 || a->tx_fifo_size == 0
+      || a->preallocated_fifo_pairs == 0)
+    return;
+
+  /* Space requirements, kept in uword so they are not truncated to u32 */
+  rx_fifo_size = (sizeof (*f) + a->rx_fifo_size) * a->preallocated_fifo_pairs;
+  tx_fifo_size = (sizeof (*f) + a->tx_fifo_size) * a->preallocated_fifo_pairs;
+
+  /* Allocate rx, then tx, fifo space. Either allocation may fail. */
+  rx_fifo_space = clib_mem_alloc_aligned_at_offset
+    (rx_fifo_size, CLIB_CACHE_LINE_BYTES, 0 /* align_offset */ ,
+     0 /* os_out_of_memory */ );
+
+  tx_fifo_space = clib_mem_alloc_aligned_at_offset
+    (tx_fifo_size, CLIB_CACHE_LINE_BYTES, 0 /* align_offset */ ,
+     0 /* os_out_of_memory */ );
+
+  /* Make sure it worked. Clean up if it didn't... */
+  if (rx_fifo_space == 0 || tx_fifo_space == 0)
+    {
+      if (rx_fifo_space)
+	clib_mem_free (rx_fifo_space);
+      else
+	clib_warning ("rx fifo preallocation failure: size %d npairs %d",
+		      a->rx_fifo_size, a->preallocated_fifo_pairs);
+
+      if (tx_fifo_space)
+	clib_mem_free (tx_fifo_space);
+      else
+	clib_warning ("tx fifo preallocation failure: size %d nfifos %d",
+		      a->tx_fifo_size, a->preallocated_fifo_pairs);
+      return;
+    }
+
+  /* Carve rx fifo space: each slot is a header plus rx_fifo_size bytes */
+  f = (svm_fifo_t *) rx_fifo_space;
+  for (i = 0; i < a->preallocated_fifo_pairs; i++)
+    {
+      f->next = fsh->free_fifos[FIFO_SEGMENT_RX_FREELIST];
+      fsh->free_fifos[FIFO_SEGMENT_RX_FREELIST] = f;
+      rx_fifo_space += sizeof (*f) + a->rx_fifo_size;
+      f = (svm_fifo_t *) rx_fifo_space;
+    }
+  /* Carve tx fifo space: each slot is a header plus tx_fifo_size bytes */
+  f = (svm_fifo_t *) tx_fifo_space;
+  for (i = 0; i < a->preallocated_fifo_pairs; i++)
+    {
+      f->next = fsh->free_fifos[FIFO_SEGMENT_TX_FREELIST];
+      fsh->free_fifos[FIFO_SEGMENT_TX_FREELIST] = f;
+      tx_fifo_space += sizeof (*f) + a->tx_fifo_size;
+      f = (svm_fifo_t *) tx_fifo_space;
+    }
+}
+
 /** (master) create an svm fifo segment */
 int
 svm_fifo_segment_create (svm_fifo_segment_create_args_t * a)
@@ -59,9 +124,7 @@
   s->h = fsh;
   fsh->segment_name = format (0, "%s%c", a->segment_name, 0);
 
-  /* Avoid vec_add1(...) failure when adding a fifo, etc. */
-  vec_validate (fsh->fifos, 64);
-  _vec_len (fsh->fifos) = 0;
+  preallocate_fifo_pairs (fsh, a);
 
   ssvm_pop_heap (oldheap);
 
@@ -103,6 +166,8 @@
   s->h = fsh;
   fsh->segment_name = format (0, "%s%c", a->segment_name, 0);
 
+  preallocate_fifo_pairs (fsh, a);
+
   sh->ready = 1;
   a->new_segment_index = s - sm->segments;
   return (0);
@@ -154,7 +219,8 @@
 
 svm_fifo_t *
 svm_fifo_segment_alloc_fifo (svm_fifo_segment_private_t * s,
-			     u32 data_size_in_bytes)
+			     u32 data_size_in_bytes,
+			     svm_fifo_segment_freelist_t list_index)
 {
   ssvm_shared_header_t *sh;
   svm_fifo_segment_header_t *fsh;
@@ -167,6 +233,29 @@
   ssvm_lock (sh, 1, 0);
   oldheap = ssvm_push_heap (sh);
 
+  switch (list_index)
+    {
+    case FIFO_SEGMENT_RX_FREELIST:
+    case FIFO_SEGMENT_TX_FREELIST:
+      f = fsh->free_fifos[list_index];
+      if (f)
+	{
+	  fsh->free_fifos[list_index] = f->next;
+	  /* (re)initialize the fifo, as in svm_fifo_create */
+	  memset (f, 0, sizeof (*f));
+	  f->nitems = data_size_in_bytes;
+	  f->ooos_list_head = OOO_SEGMENT_INVALID_INDEX;
+	  goto found;
+	}
+      /* FALLTHROUGH */
+    case FIFO_SEGMENT_FREELIST_NONE:
+      break;
+
+    default:
+      clib_warning ("ignore bogus freelist %d", list_index);
+      break;
+    }
+
   /* Note: this can fail, in which case: create another segment */
   f = svm_fifo_create (data_size_in_bytes);
   if (PREDICT_FALSE (f == 0))
@@ -176,37 +265,62 @@
       return (0);
     }
 
-  vec_add1 (fsh->fifos, f);
+found:
+  /* If rx_freelist add to active fifos list. When cleaning up segment,
+   * we need a list of active sessions that should be disconnected. Since
+   * both rx and tx fifos keep pointers to the session, it's enough to track
+   * only one. */
+  if (list_index == FIFO_SEGMENT_RX_FREELIST)
+    {
+      if (fsh->fifos)
+	{
+	  fsh->fifos->prev = f;
+	  f->next = fsh->fifos;
+	}
+      fsh->fifos = f;
+    }
+
   ssvm_pop_heap (oldheap);
   ssvm_unlock (sh);
   return (f);
 }
 
 void
-svm_fifo_segment_free_fifo (svm_fifo_segment_private_t * s, svm_fifo_t * f)
+svm_fifo_segment_free_fifo (svm_fifo_segment_private_t * s, svm_fifo_t * f,
+			    svm_fifo_segment_freelist_t list_index)
 {
   ssvm_shared_header_t *sh;
   svm_fifo_segment_header_t *fsh;
   void *oldheap;
-  int i;
 
   sh = s->ssvm.sh;
   fsh = (svm_fifo_segment_header_t *) sh->opaque[0];
 
   ssvm_lock (sh, 1, 0);
   oldheap = ssvm_push_heap (sh);
-  for (i = 0; i < vec_len (fsh->fifos); i++)
-    {
-      if (fsh->fifos[i] == f)
-	{
-	  vec_delete (fsh->fifos, 1, i);
-	  goto found;
-	}
-    }
-  clib_warning ("fifo 0x%llx not found in fifo table...", f);
 
-found:
-  clib_mem_free (f);
+  switch (list_index)
+    {
+    case FIFO_SEGMENT_RX_FREELIST:
+      /* Unlink from the active list; move the list head if f is first */
+      if (f->prev)
+	f->prev->next = f->next;
+      else if (fsh->fifos == f)
+	fsh->fifos = f->next;
+      if (f->next)
+	f->next->prev = f->prev;
+      /* FALLTHROUGH */
+    case FIFO_SEGMENT_TX_FREELIST:
+      /* Push on the matching free list; f->next is reused as the link */
+      f->next = fsh->free_fifos[list_index];
+      fsh->free_fifos[list_index] = f;
+      /* FALLTHROUGH */
+    case FIFO_SEGMENT_FREELIST_NONE:
+      break;
+    default:
+      clib_warning ("ignore bogus freelist %d", list_index);
+      break;
+    }
   ssvm_pop_heap (oldheap);
   ssvm_unlock (sh);
 }
diff --git a/src/svm/svm_fifo_segment.h b/src/svm/svm_fifo_segment.h
index 4218013..31e14db 100644
--- a/src/svm/svm_fifo_segment.h
+++ b/src/svm/svm_fifo_segment.h
@@ -19,10 +19,19 @@
 #include <svm/ssvm.h>
 #include <vppinfra/lock.h>
 
+typedef enum
+{
+  FIFO_SEGMENT_FREELIST_NONE = -1,	/**< fifo bypasses the freelists */
+  FIFO_SEGMENT_RX_FREELIST = 0,	/**< rx freelist; also on active list */
+  FIFO_SEGMENT_TX_FREELIST,	/**< tx freelist */
+  FIFO_SEGMENT_N_FREELISTS	/**< number of freelists */
+} svm_fifo_segment_freelist_t;
+
 typedef struct
 {
-  volatile svm_fifo_t **fifos;
-  u8 *segment_name;
+  svm_fifo_t *fifos;		/**< Head of linked list of active RX fifos */
+  u8 *segment_name;		/**< Segment name */
+  svm_fifo_t *free_fifos[FIFO_SEGMENT_N_FREELISTS];	/**< Free lists, indexed by svm_fifo_segment_freelist_t */
 } svm_fifo_segment_header_t;
 
 typedef struct
@@ -49,6 +58,9 @@
   char *segment_name;
   u32 segment_size;
   u32 new_segment_index;
+  u32 rx_fifo_size;
+  u32 tx_fifo_size;
+  u32 preallocated_fifo_pairs;
 } svm_fifo_segment_create_args_t;
 
 static inline svm_fifo_segment_private_t *
@@ -61,13 +73,13 @@
 static inline u8
 svm_fifo_segment_has_fifos (svm_fifo_segment_private_t * fifo_segment)
 {
-  return vec_len ((svm_fifo_t **) fifo_segment->h->fifos) != 0;
+  return fifo_segment->h->fifos != 0;	/* 1 if active list is non-empty */
 }
 
-static inline svm_fifo_t **
-svm_fifo_segment_get_fifos (svm_fifo_segment_private_t * fifo_segment)
+static inline svm_fifo_t *
+svm_fifo_segment_get_fifo_list (svm_fifo_segment_private_t * fifo_segment)
 {
-  return (svm_fifo_t **) fifo_segment->h->fifos;
+  return fifo_segment->h->fifos;	/* active list head, 0 when empty */
 }
 
 #define foreach_ssvm_fifo_segment_api_error             \
@@ -87,9 +99,11 @@
 void svm_fifo_segment_delete (svm_fifo_segment_private_t * s);
 
 svm_fifo_t *svm_fifo_segment_alloc_fifo (svm_fifo_segment_private_t * s,
-					 u32 data_size_in_bytes);
+					 u32 data_size_in_bytes,
+					 svm_fifo_segment_freelist_t index);
 void svm_fifo_segment_free_fifo (svm_fifo_segment_private_t * s,
-				 svm_fifo_t * f);
+				 svm_fifo_t * f,
+				 svm_fifo_segment_freelist_t index);
 void svm_fifo_segment_init (u64 baseva, u32 timeout_in_seconds);
 u32 svm_fifo_segment_index (svm_fifo_segment_private_t * s);
 
diff --git a/src/svm/test_svm_fifo1.c b/src/svm/test_svm_fifo1.c
index 398dd6d..63b4a9b 100644
--- a/src/svm/test_svm_fifo1.c
+++ b/src/svm/test_svm_fifo1.c
@@ -30,6 +30,9 @@
 
   a->segment_name = "fifo-test1";
   a->segment_size = 256 << 10;
+  a->rx_fifo_size = 4096;
+  a->tx_fifo_size = 4096;
+  a->preallocated_fifo_pairs = 4;
 
   rv = svm_fifo_segment_create (a);
 
@@ -38,7 +41,7 @@
 
   sp = svm_fifo_get_segment (a->new_segment_index);
 
-  f = svm_fifo_segment_alloc_fifo (sp, 4096);
+  f = svm_fifo_segment_alloc_fifo (sp, 4096, FIFO_SEGMENT_RX_FREELIST);
 
   if (f == 0)
     return clib_error_return (0, "svm_fifo_segment_alloc_fifo failed");
@@ -63,7 +66,7 @@
   else
     error = clib_error_return (0, "data test FAIL!");
 
-  svm_fifo_segment_free_fifo (sp, f);
+  svm_fifo_segment_free_fifo (sp, f, FIFO_SEGMENT_RX_FREELIST);
 
   return error;
 }
@@ -91,7 +94,7 @@
 
   sp = svm_fifo_get_segment (a->new_segment_index);
 
-  f = svm_fifo_segment_alloc_fifo (sp, 4096);
+  f = svm_fifo_segment_alloc_fifo (sp, 4096, FIFO_SEGMENT_RX_FREELIST);
 
   if (f == 0)
     return clib_error_return (0, "svm_fifo_segment_alloc_fifo failed");
@@ -129,7 +132,7 @@
 
   for (i = 0; i < 1000; i++)
     {
-      f = svm_fifo_segment_alloc_fifo (sp, 4096);
+      f = svm_fifo_segment_alloc_fifo (sp, 4096, FIFO_SEGMENT_RX_FREELIST);
       if (f == 0)
 	break;
       vec_add1 (flist, f);
@@ -139,14 +142,14 @@
   for (i = 0; i < vec_len (flist); i++)
     {
       f = flist[i];
-      svm_fifo_segment_free_fifo (sp, f);
+      svm_fifo_segment_free_fifo (sp, f, FIFO_SEGMENT_RX_FREELIST);
     }
 
   _vec_len (flist) = 0;
 
   for (i = 0; i < 1000; i++)
     {
-      f = svm_fifo_segment_alloc_fifo (sp, 4096);
+      f = svm_fifo_segment_alloc_fifo (sp, 4096, FIFO_SEGMENT_RX_FREELIST);
       if (f == 0)
 	break;
       vec_add1 (flist, f);
@@ -156,7 +159,7 @@
   for (i = 0; i < vec_len (flist); i++)
     {
       f = flist[i];
-      svm_fifo_segment_free_fifo (sp, f);
+      svm_fifo_segment_free_fifo (sp, f, FIFO_SEGMENT_RX_FREELIST);
     }
 
   return 0;
@@ -185,7 +188,7 @@
 
   sp = svm_fifo_get_segment (a->new_segment_index);
 
-  f = svm_fifo_segment_alloc_fifo (sp, 200 << 10);
+  f = svm_fifo_segment_alloc_fifo (sp, 200 << 10, FIFO_SEGMENT_RX_FREELIST);
 
   if (f == 0)
     return clib_error_return (0, "svm_fifo_segment_alloc_fifo failed");
@@ -226,9 +229,9 @@
 {
   svm_fifo_segment_create_args_t _a, *a = &_a;
   svm_fifo_segment_private_t *sp;
-  svm_fifo_segment_header_t *fsh;
   svm_fifo_t *f;
   ssvm_shared_header_t *sh;
+  svm_fifo_segment_header_t *fsh;
   int rv;
   u8 *test_data;
   u8 *retrieved_data = 0;
@@ -248,7 +251,7 @@
   fsh = (svm_fifo_segment_header_t *) sh->opaque[0];
 
   /* might wanna wait.. */
-  f = (svm_fifo_t *) fsh->fifos[0];
+  f = fsh->fifos;
 
   /* Lazy bastards united */
   test_data = format (0, "Hello world%c", 0);
diff --git a/src/uri/uri_tcp_test.c b/src/uri/uri_tcp_test.c
index 22f246e..e201a35 100755
--- a/src/uri/uri_tcp_test.c
+++ b/src/uri/uri_tcp_test.c
@@ -193,6 +193,7 @@
   bmp->context = ntohl (0xfeedface);
   bmp->options[APP_OPTIONS_FLAGS] =
     APP_OPTIONS_FLAGS_USE_FIFO | APP_OPTIONS_FLAGS_ADD_SEGMENT;
+  bmp->options[APP_OPTIONS_PREALLOC_FIFO_PAIRS] = 16;
   bmp->options[SESSION_OPTIONS_RX_FIFO_SIZE] = fifo_size;
   bmp->options[SESSION_OPTIONS_TX_FIFO_SIZE] = fifo_size;
   bmp->options[SESSION_OPTIONS_ADD_SEGMENT_SIZE] = 128 << 20;
diff --git a/src/uri/uri_udp_test.c b/src/uri/uri_udp_test.c
index 8fb12ed..45ad35a 100644
--- a/src/uri/uri_udp_test.c
+++ b/src/uri/uri_udp_test.c
@@ -176,6 +176,7 @@
   bmp->context = ntohl (0xfeedface);
   bmp->options[APP_OPTIONS_FLAGS] =
     APP_OPTIONS_FLAGS_USE_FIFO | APP_OPTIONS_FLAGS_ADD_SEGMENT;
+  bmp->options[APP_OPTIONS_PREALLOC_FIFO_PAIRS] = 16;
   bmp->options[SESSION_OPTIONS_RX_FIFO_SIZE] = fifo_size;
   bmp->options[SESSION_OPTIONS_TX_FIFO_SIZE] = fifo_size;
   bmp->options[SESSION_OPTIONS_ADD_SEGMENT_SIZE] = 128 << 20;
@@ -522,7 +523,7 @@
   svm_fifo_segment_private_t *seg;
   unix_shared_memory_queue_t *client_q;
   vl_api_connect_uri_reply_t *rmp;
-  session_t *session;
+  session_t *session = 0;
   int rv = 0;
 
   /* Create the segment */
@@ -545,17 +546,12 @@
 
   pool_get (utm->sessions, session);
 
-  /*
-   * By construction the master's idea of the rx fifo ends up in
-   * fsh->fifos[0], and the master's idea of the tx fifo ends up in
-   * fsh->fifos[1].
-   */
-  session->server_rx_fifo = svm_fifo_segment_alloc_fifo (utm->seg,
-							 128 * 1024);
+  session->server_rx_fifo = svm_fifo_segment_alloc_fifo
+    (utm->seg, 128 * 1024, FIFO_SEGMENT_RX_FREELIST);
   ASSERT (session->server_rx_fifo);
 
-  session->server_tx_fifo = svm_fifo_segment_alloc_fifo (utm->seg,
-							 128 * 1024);
+  session->server_tx_fifo = svm_fifo_segment_alloc_fifo
+    (utm->seg, 128 * 1024, FIFO_SEGMENT_TX_FREELIST);
   ASSERT (session->server_tx_fifo);
 
   session->server_rx_fifo->master_session_index = session - utm->sessions;
@@ -578,6 +574,12 @@
   rmp->context = mp->context;
   rmp->retval = ntohl (rv);
   rmp->segment_name_length = vec_len (a->segment_name);
+  if (session)
+    {
+      rmp->server_rx_fifo = pointer_to_uword (session->server_rx_fifo);
+      rmp->server_tx_fifo = pointer_to_uword (session->server_tx_fifo);
+    }
+
   memcpy (rmp->segment_name, a->segment_name, vec_len (a->segment_name));
 
   vec_free (a->segment_name);
@@ -689,9 +691,7 @@
       svm_fifo_segment_create_args_t _a, *a = &_a;
       u32 segment_index;
       session_t *session;
-      ssvm_shared_header_t *sh;
       svm_fifo_segment_private_t *seg;
-      svm_fifo_segment_header_t *fsh;
       int rv;
 
       memset (a, 0, sizeof (*a));
@@ -707,22 +707,19 @@
 	  return;
 	}
 
-      segment_index = vec_len (sm->segments) - 1;
+      segment_index = a->new_segment_index;
       vec_add2 (utm->seg, seg, 1);
-
       memcpy (seg, sm->segments + segment_index, sizeof (*seg));
-      sh = seg->ssvm.sh;
-      fsh = (svm_fifo_segment_header_t *) sh->opaque[0];
-
-      while (vec_len (fsh->fifos) < 2)
-	sleep (1);
+      sleep (1);
 
       pool_get (utm->sessions, session);
       utm->cut_through_session_index = session - utm->sessions;
 
-      session->server_rx_fifo = (svm_fifo_t *) fsh->fifos[0];
+      session->server_rx_fifo = uword_to_pointer (mp->server_rx_fifo,
+						  svm_fifo_t *);
       ASSERT (session->server_rx_fifo);
-      session->server_tx_fifo = (svm_fifo_t *) fsh->fifos[1];
+      session->server_tx_fifo = uword_to_pointer (mp->server_tx_fifo,
+						  svm_fifo_t *);
       ASSERT (session->server_tx_fifo);
     }
 
diff --git a/src/vlibapi/api.h b/src/vlibapi/api.h
index 3403e1c..0e2c210 100644
--- a/src/vlibapi/api.h
+++ b/src/vlibapi/api.h
@@ -193,6 +193,9 @@
 
   i32 vlib_signal;
 
+  /* vlib input queue length */
+  u32 vlib_input_queue_length;
+
   /* client side message index hash table */
   uword *msg_index_by_name_and_crc;
 
diff --git a/src/vlibmemory/memory_shared.c b/src/vlibmemory/memory_shared.c
index aea9033..41aa123 100644
--- a/src/vlibmemory/memory_shared.c
+++ b/src/vlibmemory/memory_shared.c
@@ -104,8 +104,17 @@
 	      if (now - rv->gc_mark_timestamp > 10)
 		{
 		  if (CLIB_DEBUG > 0)
-		    clib_warning ("garbage collect pool %d ring %d index %d",
-				  pool, i, q->head);
+		    {
+		      u16 *msg_idp, msg_id;
+		      clib_warning
+			("garbage collect pool %d ring %d index %d", pool, i,
+			 q->head);
+		      msg_idp = (u16 *) (rv->data);
+		      msg_id = clib_net_to_host_u16 (*msg_idp);
+		      if (msg_id < vec_len (api_main.msg_names))
+			clib_warning ("msg id %d name %s", (u32) msg_id,
+				      api_main.msg_names[msg_id]);
+		    }
 		  shmem_hdr->garbage_collects++;
 		  goto collected;
 		}
@@ -330,6 +339,7 @@
   api_main_t *am = &api_main;
   int i;
   struct timespec ts, tsrem;
+  u32 vlib_input_queue_length;
 
   if (is_vlib == 0)
     svm_region_init_chroot (am->root_path);
@@ -449,9 +459,13 @@
   shmem_hdr->version = VL_SHM_VERSION;
 
   /* vlib main input queue */
+  vlib_input_queue_length = 1024;
+  if (am->vlib_input_queue_length)
+    vlib_input_queue_length = am->vlib_input_queue_length;
+
   shmem_hdr->vl_input_queue =
-    unix_shared_memory_queue_init (1024, sizeof (uword), getpid (),
-				   am->vlib_signal);
+    unix_shared_memory_queue_init (vlib_input_queue_length, sizeof (uword),
+				   getpid (), am->vlib_signal);
 
   /* Set up the msg ring allocator */
 #define _(sz,n)                                                 \
diff --git a/src/vlibmemory/memory_vlib.c b/src/vlibmemory/memory_vlib.c
index e5d8873..004a997 100644
--- a/src/vlibmemory/memory_vlib.c
+++ b/src/vlibmemory/memory_vlib.c
@@ -1917,6 +1917,32 @@
 
 VLIB_CONFIG_FUNCTION (api_config_fn, "api-trace");
 
+/** "api-queue" startup config: accepts "length <n>" or "len <n>".
+ * Lengths below 1024 are ignored with a warning. */
+static clib_error_t *
+api_queue_config_fn (vlib_main_t * vm, unformat_input_t * input)
+{
+  u32 qlen;
+
+  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (!unformat (input, "length %d", &qlen)
+	  && !unformat (input, "len %d", &qlen))
+	return clib_error_return (0, "unknown input `%U'",
+				  format_unformat_error, input);
+
+      if (qlen < 1024)
+	clib_warning ("vlib input queue length %d too small, ignored",
+		      qlen);
+      else
+	api_main.vlib_input_queue_length = qlen;
+    }
+
+  return 0;
+}
+
+VLIB_CONFIG_FUNCTION (api_queue_config_fn, "api-queue");
+
 /*
  * fd.io coding-style-patch-verification: ON
  *
diff --git a/src/vnet/session/application.c b/src/vnet/session/application.c
index ccf9837..c679b1f 100644
--- a/src/vnet/session/application.c
+++ b/src/vnet/session/application.c
@@ -160,6 +160,7 @@
   props->rx_fifo_size = options[SESSION_OPTIONS_RX_FIFO_SIZE];
   props->tx_fifo_size = options[SESSION_OPTIONS_TX_FIFO_SIZE];
   props->add_segment = props->add_segment_size != 0;
+  props->preallocated_fifo_pairs = options[APP_OPTIONS_PREALLOC_FIFO_PAIRS];
   props->use_private_segment = options[APP_OPTIONS_FLAGS]
     & APP_OPTIONS_FLAGS_BUILTIN_APP;
 
@@ -395,7 +396,7 @@
   vlib_main_t *vm = vlib_get_main ();
   segment_manager_t *sm;
   u8 *app_name, *s = 0;
-  int i, j;
+  int j;
 
   /* Header */
   if (app == 0)
@@ -419,22 +420,16 @@
   for (j = 0; j < vec_len (sm->segment_indices); j++)
     {
       svm_fifo_segment_private_t *fifo_segment;
-      svm_fifo_t **fifos;
+      svm_fifo_t *fifo;
       u8 *str;
 
       fifo_segment = svm_fifo_get_segment (sm->segment_indices[j]);
-      fifos = svm_fifo_segment_get_fifos (fifo_segment);
-      for (i = 0; i < vec_len (fifos); i++)
+      fifo = svm_fifo_segment_get_fifo_list (fifo_segment);
+      while (fifo)
 	{
-	  svm_fifo_t *fifo;
 	  u32 session_index, thread_index;
 	  stream_session_t *session;
 
-	  /* There are 2 fifos/session. Avoid printing twice. */
-	  if (i % 2)
-	    continue;
-
-	  fifo = fifos[i];
 	  session_index = fifo->master_session_index;
 	  thread_index = fifo->master_thread_index;
 
@@ -448,9 +443,10 @@
 	    s = format (s, "%-40s%-20s", str, app_name);
 
 	  vlib_cli_output (vm, "%v", s);
-
 	  vec_reset_length (s);
 	  vec_free (str);
+
+	  fifo = fifo->next;
 	}
       vec_free (s);
     }
diff --git a/src/vnet/session/application_interface.h b/src/vnet/session/application_interface.h
index 7d924c1..4d6f9de 100644
--- a/src/vnet/session/application_interface.h
+++ b/src/vnet/session/application_interface.h
@@ -119,10 +119,12 @@
 {
   APP_EVT_QUEUE_SIZE,
   APP_OPTIONS_FLAGS,
+  APP_OPTIONS_PREALLOC_FIFO_PAIRS,
   SESSION_OPTIONS_SEGMENT_SIZE,
   SESSION_OPTIONS_ADD_SEGMENT_SIZE,
   SESSION_OPTIONS_RX_FIFO_SIZE,
   SESSION_OPTIONS_TX_FIFO_SIZE,
+  SESSION_OPTIONS_PREALLOCATED_FIFO_PAIRS,
   SESSION_OPTIONS_ACCEPT_COOKIE,
   SESSION_OPTIONS_N_OPTIONS
 } app_attach_options_index_t;
diff --git a/src/vnet/session/segment_manager.c b/src/vnet/session/segment_manager.c
index b13df21..caf8eaa 100644
--- a/src/vnet/session/segment_manager.c
+++ b/src/vnet/session/segment_manager.c
@@ -58,6 +58,9 @@
 
   ca->segment_name = (char *) segment_name;
   ca->segment_size = segment_size;
+  ca->rx_fifo_size = sm->properties->rx_fifo_size;
+  ca->tx_fifo_size = sm->properties->tx_fifo_size;
+  ca->preallocated_fifo_pairs = sm->properties->preallocated_fifo_pairs;
 
   rv = svm_fifo_segment_create (ca);
   if (rv)
@@ -104,7 +107,8 @@
 }
 
 static void
-segment_manager_alloc_process_private_segment ()
+  segment_manager_alloc_process_private_segment
+  (segment_manager_properties_t * props)
 {
   svm_fifo_segment_create_args_t _a, *a = &_a;
 
@@ -115,6 +119,9 @@
   a->segment_name = "process-private-segment";
   a->segment_size = ~0;
   a->new_segment_index = ~0;
+  a->rx_fifo_size = props->rx_fifo_size;
+  a->tx_fifo_size = props->tx_fifo_size;
+  a->preallocated_fifo_pairs = props->preallocated_fifo_pairs;
 
   if (svm_fifo_segment_create_process_private (a))
     clib_warning ("Failed to create process private segment");
@@ -151,7 +158,7 @@
   else
     {
       if (private_segment_index == ~0)
-	segment_manager_alloc_process_private_segment ();
+	segment_manager_alloc_process_private_segment (properties);
       ASSERT (private_segment_index != ~0);
       vec_add1 (sm->segment_indices, private_segment_index);
     }
@@ -170,74 +177,46 @@
 void
 segment_manager_del (segment_manager_t * sm)
 {
-  u32 *deleted_sessions = 0;
-  u32 *deleted_thread_indices = 0;
-  int i, j;
+  int j;
 
   /* Across all fifo segments used by the server */
   for (j = 0; j < vec_len (sm->segment_indices); j++)
     {
       svm_fifo_segment_private_t *fifo_segment;
-      svm_fifo_t **fifos;
+      svm_fifo_t *fifo;		/* cursor over the segment's fifo list */
+
       /* Vector of fifos allocated in the segment */
       fifo_segment = svm_fifo_get_segment (sm->segment_indices[j]);
-      fifos = svm_fifo_segment_get_fifos (fifo_segment);
+      fifo = svm_fifo_segment_get_fifo_list (fifo_segment);
 
       /*
        * Remove any residual sessions from the session lookup table
        * Don't bother deleting the individual fifos, we're going to
        * throw away the fifo segment in a minute.
        */
-      for (i = 0; i < vec_len (fifos); i++)
+      while (fifo)
 	{
-	  svm_fifo_t *fifo;
 	  u32 session_index, thread_index;
 	  stream_session_t *session;
 
-	  fifo = fifos[i];
 	  session_index = fifo->master_session_index;
 	  thread_index = fifo->master_thread_index;
 
 	  session = stream_session_get (session_index, thread_index);
 
-	  /* Add to the deleted_sessions vector (once!) */
-	  if (!session->is_deleted)
-	    {
-	      session->is_deleted = 1;
-	      vec_add1 (deleted_sessions, session_index);
-	      vec_add1 (deleted_thread_indices, thread_index);
-	    }
-	}
-
-      for (i = 0; i < vec_len (deleted_sessions); i++)
-	{
-	  stream_session_t *session;
-	  session = stream_session_get (deleted_sessions[i],
-					deleted_thread_indices[i]);
-
 	  /* Instead of directly removing the session call disconnect */
 	  session_send_session_evt_to_thread (stream_session_handle (session),
 					      FIFO_EVENT_DISCONNECT,
-					      deleted_thread_indices[i]);
-
-	  /*
-	     stream_session_table_del (smm, session);
-	     pool_put(smm->sessions[deleted_thread_indices[i]], session);
-	   */
+					      thread_index);
+	  fifo = fifo->next;	/* advance to the next listed fifo */
 	}
 
-      vec_reset_length (deleted_sessions);
-      vec_reset_length (deleted_thread_indices);
-
-      /* Instead of removing the segment, test when removing the session if
-       * the segment can be removed
+      /* The segment is not removed here; whether it can be removed is
+       * tested when the disconnected sessions are cleaned up.
        */
-      /* svm_fifo_segment_delete (fifo_segment); */
     }
 
   clib_spinlock_free (&sm->lockp);
-  vec_free (deleted_sessions);
-  vec_free (deleted_thread_indices);
   pool_put (segment_managers, sm);
 }
 
@@ -281,20 +260,27 @@
       *fifo_segment_index = sm->segment_indices[i];
       fifo_segment = svm_fifo_get_segment (*fifo_segment_index);
 
+      /* FC: cleanup, make sure sm->properties->xxx_fifo_size always set */
       fifo_size = sm->properties->rx_fifo_size;
       fifo_size = (fifo_size == 0) ? default_fifo_size : fifo_size;
-      *server_rx_fifo = svm_fifo_segment_alloc_fifo (fifo_segment, fifo_size);
+      *server_rx_fifo =
+	svm_fifo_segment_alloc_fifo (fifo_segment, fifo_size,
+				     FIFO_SEGMENT_RX_FREELIST);
 
+      /* FC: cleanup, make sure sm->properties->xxx_fifo_size always set */
       fifo_size = sm->properties->tx_fifo_size;
       fifo_size = (fifo_size == 0) ? default_fifo_size : fifo_size;
-      *server_tx_fifo = svm_fifo_segment_alloc_fifo (fifo_segment, fifo_size);
+      *server_tx_fifo =
+	svm_fifo_segment_alloc_fifo (fifo_segment, fifo_size,
+				     FIFO_SEGMENT_TX_FREELIST);
 
       if (*server_rx_fifo == 0)
 	{
 	  /* This would be very odd, but handle it... */
 	  if (*server_tx_fifo != 0)
 	    {
-	      svm_fifo_segment_free_fifo (fifo_segment, *server_tx_fifo);
+	      svm_fifo_segment_free_fifo (fifo_segment, *server_tx_fifo,
+					  FIFO_SEGMENT_TX_FREELIST);
 	      *server_tx_fifo = 0;
 	    }
 	  continue;
@@ -303,7 +289,8 @@
 	{
 	  if (*server_rx_fifo != 0)
 	    {
-	      svm_fifo_segment_free_fifo (fifo_segment, *server_rx_fifo);
+	      svm_fifo_segment_free_fifo (fifo_segment, *server_rx_fifo,
+					  FIFO_SEGMENT_RX_FREELIST);
 	      *server_rx_fifo = 0;
 	    }
 	  continue;
@@ -365,8 +352,10 @@
     return;
 
   fifo_segment = svm_fifo_get_segment (svm_segment_index);
-  svm_fifo_segment_free_fifo (fifo_segment, rx_fifo);
-  svm_fifo_segment_free_fifo (fifo_segment, tx_fifo);
+  svm_fifo_segment_free_fifo (fifo_segment, rx_fifo,
+			      FIFO_SEGMENT_RX_FREELIST);
+  svm_fifo_segment_free_fifo (fifo_segment, tx_fifo,
+			      FIFO_SEGMENT_TX_FREELIST);
 
   /* Remove segment only if it holds no fifos and not the first */
   if (sm->segment_indices[0] != svm_segment_index
diff --git a/src/vnet/session/segment_manager.h b/src/vnet/session/segment_manager.h
index 2710bb5..d4b7320 100644
--- a/src/vnet/session/segment_manager.h
+++ b/src/vnet/session/segment_manager.h
@@ -28,6 +28,9 @@
   u32 rx_fifo_size;
   u32 tx_fifo_size;
 
+  /** Preallocated pool sizes */
+  u32 preallocated_fifo_pairs;
+
   /** Configured additional segment size */
   u32 add_segment_size;
 
diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c
index c5aaf2e..02b0cce 100644
--- a/src/vnet/session/session.c
+++ b/src/vnet/session/session.c
@@ -1048,19 +1048,21 @@
 {
   api_main_t *am = &api_main;
   void *oldheap;
+  u32 event_queue_length = 2048;
 
   if (smm->vpp_event_queues[thread_index] == 0)
     {
       /* Allocate event fifo in the /vpe-api shared-memory segment */
       oldheap = svm_push_data_heap (am->vlib_rp);
 
+      if (smm->configured_event_queue_length)
+	event_queue_length = smm->configured_event_queue_length;
+
       smm->vpp_event_queues[thread_index] =
-	unix_shared_memory_queue_init (2048 /* nels $$$$ config */ ,
-				       sizeof (session_fifo_event_t),
-				       0 /* consumer pid */ ,
-				       0
-				       /* (do not) send signal when queue non-empty */
-	);
+	unix_shared_memory_queue_init
+	(event_queue_length,
+	 sizeof (session_fifo_event_t), 0 /* consumer pid */ ,
+	 0 /* (do not) send signal when queue non-empty */ );
 
       svm_pop_heap (oldheap);
     }
@@ -1187,6 +1189,30 @@
 }
 
 VLIB_INIT_FUNCTION (session_manager_main_init)
+     /** "session" startup config: "event-queue-length <n>", n >= 2048 */
+     static clib_error_t *session_config_fn (vlib_main_t * vm,
+					     unformat_input_t * input)
+{
+  session_manager_main_t *smm = &session_manager_main;
+  u32 qlen;
+
+  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (!unformat (input, "event-queue-length %d", &qlen))
+	return clib_error_return (0, "unknown input `%U'",
+				  format_unformat_error, input);
+
+      if (qlen < 2048)
+	clib_warning ("event queue length %d too small, ignored", qlen);
+      else
+	smm->configured_event_queue_length = qlen;
+    }
+
+  return 0;
+}
+
+VLIB_CONFIG_FUNCTION (session_config_fn, "session");
+
 /*
  * fd.io coding-style-patch-verification: ON
  *
diff --git a/src/vnet/session/session.h b/src/vnet/session/session.h
index d60cca2..a872864 100644
--- a/src/vnet/session/session.h
+++ b/src/vnet/session/session.h
@@ -125,14 +125,11 @@
 
   u8 thread_index;
 
-  /** used during unbind processing */
-  u8 is_deleted;
-
   /** To avoid n**2 "one event per frame" check */
   u8 enqueue_epoch;
 
   /** Pad to a multiple of 8 octets */
-  u8 align_pad[2];
+  u8 align_pad[4];
 
   /** svm segment index where fifos were allocated */
   u32 svm_segment_index;
@@ -205,6 +202,9 @@
   /** vpp fifo event queue */
   unix_shared_memory_queue_t **vpp_event_queues;
 
+  /** vpp fifo event queue configured length */
+  u32 configured_event_queue_length;
+
   /** Unique segment name counter */
   u32 unique_segment_name_counter;
 
diff --git a/src/vnet/session/session_api.c b/src/vnet/session/session_api.c
index 8c073a0..98d6946 100755
--- a/src/vnet/session/session_api.c
+++ b/src/vnet/session/session_api.c
@@ -419,7 +419,7 @@
   REPLY_MACRO (VL_API_UNBIND_URI_REPLY);
 }
 
-static void
+void
 vl_api_connect_uri_t_handler (vl_api_connect_uri_t * mp)
 {
   vl_api_connect_uri_reply_t *rmp;
diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c
index aaefa7e..768f0c3 100644
--- a/src/vnet/tcp/builtin_client.c
+++ b/src/vnet/tcp/builtin_client.c
@@ -44,8 +44,6 @@
 #undef vl_printfun
 
 #define TCP_BUILTIN_CLIENT_DBG (1)
-#define TCP_BUILTIN_CLIENT_VPP_THREAD (0)
-#define TCP_BUILTIN_CLIENT_PTHREAD (!TCP_BUILTIN_CLIENT_VPP_THREAD)
 
 static void
 send_test_chunk (tclient_main_t * tm, session_t * s)
@@ -156,131 +154,76 @@
     }
 }
 
-#if TCP_BUILTIN_CLIENT_VPP_THREAD
-#define THREAD_PROTOTYPE static void
-#else
-#define THREAD_PROTOTYPE static void *
-#endif
-
-THREAD_PROTOTYPE
-tclient_thread_fn (void *arg)
+/* Per-thread input node: pump each active test session, retire done ones */
+static uword
+builtin_client_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
+			vlib_frame_t * frame)
 {
   tclient_main_t *tm = &tclient_main;
+  int my_thread_index = vlib_get_thread_index ();
   vl_api_disconnect_session_t *dmp;
   session_t *sp;
-  struct timespec ts, tsrem;
   int i;
-  int try_tx, try_rx;
-  u32 *session_indices = 0;
-  clib_time_t ttime;
-  f64 before, after;
-  u64 rx_total;
+  int delete_session;
+  u32 *connection_indices;
 
-  clib_time_init (&ttime);
+  connection_indices = tm->connection_index_by_thread[my_thread_index];
 
-  /* stats thread wants no signals. */
-  {
-    sigset_t s;
-    sigfillset (&s);
-    pthread_sigmask (SIG_SETMASK, &s, 0);
-  }
+  if (tm->run_test == 0 || vec_len (connection_indices) == 0)
+    return 0;
 
-  clib_per_cpu_mheaps[vlib_get_thread_index ()] = clib_per_cpu_mheaps[0];
-
-  vec_validate (session_indices, 0);
-  vec_reset_length (session_indices);
-
-  while (1)
+  for (i = 0; i < vec_len (connection_indices); i++)
     {
-      /* Wait until we're told to get busy */
-      while (tm->run_test == 0
-	     || (tm->ready_connections != tm->expected_connections))
+      delete_session = 1;
+
+      sp = pool_elt_at_index (tm->sessions, connection_indices[i]);
+
+      if (sp->bytes_to_send > 0)
 	{
-	  ts.tv_sec = 0;
-	  ts.tv_nsec = 100000000;
-	  while (nanosleep (&ts, &tsrem) < 0)
-	    ts = tsrem;
+	  send_test_chunk (tm, sp);
+	  delete_session = 0;
 	}
-      tm->run_test = 0;
-      rx_total = 0;
-
-      clib_warning ("Start test...");
-
-      before = clib_time_now (&ttime);
-
-      do
+      if (sp->bytes_to_receive > 0)
 	{
-	  do
+	  receive_test_chunk (tm, sp);
+	  delete_session = 0;
+	}
+      if (PREDICT_FALSE (delete_session == 1))
+	{
+	  __sync_fetch_and_add (&tm->rx_total, sp->bytes_received);
+	  dmp = vl_msg_api_alloc_as_if_client (sizeof (*dmp));
+	  memset (dmp, 0, sizeof (*dmp));
+	  dmp->_vl_msg_id = ntohs (VL_API_DISCONNECT_SESSION);
+	  dmp->client_index = tm->my_client_index;
+	  dmp->handle = sp->vpp_session_handle;
+	  vl_msg_api_send_shmem (tm->vl_input_queue, (u8 *) & dmp);
+	  vec_delete (connection_indices, 1, i);
+	  tm->connection_index_by_thread[my_thread_index] =
+	    connection_indices;
+
+	  /* Kick the debug CLI process, once: only one decrement yields 0 */
+	  if (__sync_sub_and_fetch (&tm->ready_connections, 1) == 0)
 	    {
-	      try_tx = try_rx = 0;
-
-	      /* *INDENT-OFF* */
-	      pool_foreach (sp, tm->sessions,
-              ({
-                if (sp->bytes_to_send > 0)
-                  {
-                    send_test_chunk (tm, sp);
-                    try_tx = 1;
-                  }
-	      }));
-	      pool_foreach (sp, tm->sessions,
-              ({
-		if (sp->bytes_to_receive > 0)
-                  {
-                    receive_test_chunk (tm, sp);
-                    try_rx = 1;
-                  }
-                else
-                  {
-                    /* Session is complete */
-                    vec_add1 (session_indices, sp - tm->sessions);
-                  }
-              }));
-              /* Terminate any completed sessions */
-              if (PREDICT_FALSE (_vec_len(session_indices) != 0))
-                {
-                  for (i = 0; i < _vec_len (session_indices); i++)
-                    {
-                      sp = pool_elt_at_index (tm->sessions, session_indices[i]);
-                      rx_total += sp->bytes_received;
-                      dmp = vl_msg_api_alloc_as_if_client (sizeof (*dmp));
-                      memset (dmp, 0, sizeof (*dmp));
-                      dmp->_vl_msg_id = ntohs (VL_API_DISCONNECT_SESSION);
-                      dmp->client_index = tm->my_client_index;
-                      dmp->handle = sp->vpp_session_handle;
-                      vl_msg_api_send_shmem (tm->vl_input_queue, (u8 *) & dmp);
-                      pool_put (tm->sessions, sp);
-                    }
-                  _vec_len(session_indices) = 0;
-                }
-	      /* *INDENT-ON* */
+	      tm->test_end_time = vlib_time_now (vm);
+	      vlib_process_signal_event (vm, tm->cli_node_index,
+					 2, 0 /* data */ );
 	    }
-	  while (try_tx || try_rx);
 	}
-      while (0);
-      after = clib_time_now (&ttime);
-
-      clib_warning ("Test complete %lld bytes in %.2f secs",
-		    rx_total, (after - before));
-      if ((after - before) != 0.0)
-	{
-	  clib_warning ("%.2f bytes/second full-duplex",
-			((f64) rx_total) / (after - before));
-	  clib_warning ("%.4f gbit/second full-duplex",
-			(((f64) rx_total * 8.0) / (after - before)) / 1e9);
-	}
-
-      if (pool_elts (tm->sessions))
-	clib_warning ("BUG: %d active sessions remain...",
-		      pool_elts (tm->sessions));
     }
-  while (0);
-  /* NOTREACHED */
-#if TCP_BUILTIN_CLIENT_PTHREAD
   return 0;
-#endif
 }
 
+/* Input node running builtin_client_node_fn; registered disabled */
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (builtin_client_node) =
+{
+  .name = "builtin-tcp-client",
+  .type = VLIB_NODE_TYPE_INPUT,
+  .state = VLIB_NODE_STATE_DISABLED,
+  .function = builtin_client_node_fn,
+};
+/* *INDENT-ON* */
+
 /* So we don't get "no handler for... " msgs */
 static void
 vl_api_memclnt_create_reply_t_handler (vl_api_memclnt_create_reply_t * mp)
@@ -299,6 +242,7 @@
   session_t *session;
   u32 session_index;
   i32 retval = /* clib_net_to_host_u32 ( */ mp->retval /*) */ ;
+  int i;
 
   if (retval < 0)
     {
@@ -332,7 +276,29 @@
   /* Add it to the session lookup table */
   hash_set (tm->session_index_by_vpp_handles, mp->handle, session_index);
 
-  tm->ready_connections++;
+  if (tm->ready_connections == tm->expected_connections - 1)
+    {
+      vlib_thread_main_t *thread_main = vlib_get_thread_main ();
+      int thread_index;
+
+      thread_index = 0;
+      for (i = 0; i < pool_elts (tm->sessions); i++)
+	{
+	  vec_add1 (tm->connection_index_by_thread[thread_index], i);
+	  thread_index++;
+	  if (thread_index == thread_main->n_vlib_mains)
+	    thread_index = 0;
+	}
+    }
+  __sync_fetch_and_add (&tm->ready_connections, 1);
+  if (tm->ready_connections == tm->expected_connections)
+    {
+      tm->run_test = 1;
+      tm->test_start_time = vlib_time_now (tm->vlib_main);
+      /* Signal the CLI process that the action is starting... */
+      vlib_process_signal_event (tm->vlib_main, tm->cli_node_index,
+				 1, 0 /* data */ );
+    }
 }
 
 static int
@@ -414,6 +380,7 @@
 tcp_test_clients_init (vlib_main_t * vm)
 {
   tclient_main_t *tm = &tclient_main;
+  vlib_thread_main_t *thread_main = vlib_get_thread_main ();
   int i;
 
   tclient_api_hookup (vm);
@@ -429,6 +396,46 @@
   vec_validate (tm->rx_buf, vec_len (tm->connect_test_data) - 1);
 
   tm->is_init = 1;
+  tm->vlib_main = vm;
+
+  vec_validate (tm->connection_index_by_thread, thread_main->n_vlib_mains);
+  return 0;
+}
+
+/* Connect notification for the builtin client: build the connect reply on
+   the stack and hand it straight to the client's reply handler */
+static int
+builtin_session_connected_callback (u32 app_index, u32 api_context,
+				    stream_session_t * s, u8 is_fail)
+{
+  vl_api_connect_uri_reply_t _m, *mp = &_m;
+  unix_shared_memory_queue_t *q, *vpp_queue;
+  application_t *app;
+
+  app = application_get (app_index);
+  q = vl_api_client_index_to_input_queue (app->api_client_index);
+
+  if (!q)
+    return -1;
+
+  memset (mp, 0, sizeof (*mp));
+  mp->_vl_msg_id = clib_host_to_net_u16 (VL_API_CONNECT_URI_REPLY);
+  mp->context = api_context;
+  if (!is_fail)
+    {
+      vpp_queue = session_manager_get_vpp_event_queue (s->thread_index);
+      mp->server_rx_fifo = pointer_to_uword (s->server_rx_fifo);
+      mp->server_tx_fifo = pointer_to_uword (s->server_tx_fifo);
+      mp->handle = stream_session_handle (s);
+      mp->vpp_event_queue_address = pointer_to_uword (vpp_queue);
+      mp->retval = 0;
+      s->session_state = SESSION_STATE_READY;
+    }
+  else
+    /* Host byte order: the handler called below reads retval unswapped */
+    mp->retval = VNET_API_ERROR_SESSION_CONNECT_FAIL;
+
+  vl_api_connect_uri_reply_t_handler (mp);
 
   return 0;
 }
@@ -461,7 +468,7 @@
 static session_cb_vft_t builtin_clients =
   {
     .session_reset_callback = builtin_session_reset_callback,
-    .session_connected_callback = send_session_connected_callback,
+    .session_connected_callback = builtin_session_connected_callback,
     .session_accept_callback = builtin_session_create_callback,
     .session_disconnect_callback = builtin_session_disconnect_callback,
     .builtin_server_rx_callback = builtin_server_rx_callback
@@ -502,11 +509,16 @@
 			     vlib_cli_command_t * cmd)
 {
   tclient_main_t *tm = &tclient_main;
+  vlib_thread_main_t *thread_main = vlib_get_thread_main ();
+  uword *event_data = 0;
+  uword event_type;
   u8 *connect_uri = (u8 *) "tcp://6.0.1.1/1234";
   u8 *uri;
   u32 n_clients = 1;
   int i;
   u64 tmp;
+  f64 cli_timeout = 20.0;
+  f64 delta;
 
   tm->bytes_to_send = 8192;
   vec_free (tm->connect_uri);
@@ -523,6 +535,8 @@
 	;
       else if (unformat (input, "uri %s", &tm->connect_uri))
 	;
+      else if (unformat (input, "cli-timeout %f", &cli_timeout))
+	;
       else
 	return clib_error_return (0, "unknown input `%U'",
 				  format_unformat_error, input);
@@ -536,6 +550,7 @@
 
   tm->ready_connections = 0;
   tm->expected_connections = n_clients;
+  tm->rx_total = 0;
 
   uri = connect_uri;
   if (tm->connect_uri)
@@ -556,41 +571,100 @@
     }
 #endif
   vnet_session_enable_disable (vm, 1 /* turn on TCP, etc. */ );
-  attach_builtin_test_clients ();
+  if (tm->test_client_attached == 0)
+    attach_builtin_test_clients ();
+  tm->test_client_attached = 1;
 
-  /* Fire off connect requests, in something approaching a normal manner */
+  /* Turn on the builtin client input nodes */
+  for (i = 0; i < thread_main->n_vlib_mains; i++)
+    vlib_node_set_state (vlib_mains[i], builtin_client_node.index,
+			 VLIB_NODE_STATE_POLLING);
+
+  tm->cli_node_index = vlib_get_current_process (vm)->node_runtime.node_index;
+
+  /* Fire off connect requests */
   for (i = 0; i < n_clients; i++)
     {
-      vl_api_connect_uri_t *cmp;
-      cmp = vl_msg_api_alloc_as_if_client (sizeof (*cmp));
+      vl_api_connect_uri_t _cmp, *cmp = &_cmp;
+      void vl_api_connect_uri_t_handler (vl_api_connect_uri_t * cmp);
+
       memset (cmp, 0, sizeof (*cmp));
 
       cmp->_vl_msg_id = ntohs (VL_API_CONNECT_URI);
       cmp->client_index = tm->my_client_index;
       cmp->context = ntohl (0xfeedface);
       memcpy (cmp->uri, uri, strlen ((char *) uri) + 1);
-      vl_msg_api_send_shmem (tm->vl_input_queue, (u8 *) & cmp);
+
+      vl_api_connect_uri_t_handler (cmp);
+      /* Crude pacing for call setups, 100k/sec  */
+      vlib_process_suspend (vm, 10e-6);
     }
 
-  tm->run_test = 1;
+  /* Park until the sessions come up, or ten seconds elapse... */
+  vlib_process_wait_for_event_or_clock (vm, 10.0 /* timeout, seconds */ );
+  event_type = vlib_process_get_events (vm, &event_data);
+
+  switch (event_type)
+    {
+    case ~0:
+      vlib_cli_output (vm, "Timeout with only %d sessions active...",
+		       tm->ready_connections);
+      goto cleanup;
+
+    case 1:
+      vlib_cli_output (vm, "Test started at %.6f", tm->test_start_time);
+      break;
+
+    default:
+      vlib_cli_output (vm, "unexpected event(1): %d", event_type);
+      goto cleanup;
+    }
+
+  /* Now wait for the sessions to finish... */
+  vlib_process_wait_for_event_or_clock (vm, cli_timeout);
+  event_type = vlib_process_get_events (vm, &event_data);
+
+  switch (event_type)
+    {
+    case ~0:
+      vlib_cli_output (vm, "Timeout with %d sessions still active...",
+		       tm->ready_connections);
+      goto cleanup;
+
+    case 2:
+      vlib_cli_output (vm, "Test finished at %.6f", tm->test_end_time);
+      break;
+
+    default:
+      vlib_cli_output (vm, "unexpected event(2): %d", event_type);
+      goto cleanup;
+    }
+
+  delta = tm->test_end_time - tm->test_start_time;
+
+  if (delta != 0.0)
+    {
+      vlib_cli_output (vm,
+		       "%lld bytes (%lld mbytes, %lld gbytes) in %.2f seconds",
+		       tm->rx_total, tm->rx_total / (1ULL << 20),
+		       tm->rx_total / (1ULL << 30), delta);
+      vlib_cli_output (vm, "%.2f bytes/second full-duplex",
+		       ((f64) tm->rx_total) / (delta));
+      vlib_cli_output (vm, "%.4f gbit/second full-duplex",
+		       (((f64) tm->rx_total * 8.0) / delta / 1e9));
+    }
+  else
+    vlib_cli_output (vm, "zero delta-t?");
+
+cleanup:
+  pool_free (tm->sessions);
+  for (i = 0; i < vec_len (tm->connection_index_by_thread); i++)
+    vec_reset_length (tm->connection_index_by_thread[i]);
 
   return 0;
 }
 
 /* *INDENT-OFF* */
-#if TCP_BUILTIN_CLIENT_VPP_THREAD
-VLIB_REGISTER_THREAD (builtin_client_reg, static) =
-{
-  .name = "tcp-builtin-client",
-  .function = tclient_thread_fn,
-  .fixed_count = 1,
-  .count = 1,
-  .no_data_structure_clone = 1,
-};
-#endif
-/* *INDENT-ON* */
-
-/* *INDENT-OFF* */
 VLIB_CLI_COMMAND (test_clients_command, static) =
 {
   .path = "test tcp clients",
diff --git a/src/vnet/tcp/builtin_client.h b/src/vnet/tcp/builtin_client.h
index 57d112e..d5d79e5 100644
--- a/src/vnet/tcp/builtin_client.h
+++ b/src/vnet/tcp/builtin_client.h
@@ -83,14 +83,18 @@
 
   pid_t my_pid;
 
-  /* For deadman timers */
-  clib_time_t clib_time;
+  f64 test_start_time;
+  f64 test_end_time;
 
-  /* Connection counts */
   u32 expected_connections;
+  u32 **connection_index_by_thread;
   volatile u32 ready_connections;
+  volatile u32 finished_connections;
 
-  /* Signal variables */
+  volatile u64 rx_total;
+  u32 cli_node_index;
+
+  /* Signal variable */
   volatile int run_test;
 
   /* Bytes to send */
@@ -107,6 +111,7 @@
   u8 test_return_packets;
 
   u8 is_init;
+  u8 test_client_attached;
 
   u32 node_index;
 
diff --git a/src/vnet/tcp/builtin_http_server.c b/src/vnet/tcp/builtin_http_server.c
index 763a46e..8b4801c 100644
--- a/src/vnet/tcp/builtin_http_server.c
+++ b/src/vnet/tcp/builtin_http_server.c
@@ -513,6 +513,7 @@
   a->options[SESSION_OPTIONS_RX_FIFO_SIZE] = 8 << 10;
   a->options[SESSION_OPTIONS_TX_FIFO_SIZE] = 32 << 10;
   a->options[APP_OPTIONS_FLAGS] = APP_OPTIONS_FLAGS_BUILTIN_APP;
+  a->options[APP_OPTIONS_PREALLOC_FIFO_PAIRS] = 16;
   a->segment_name = segment_name;
   a->segment_name_length = ARRAY_LEN (segment_name);
 
diff --git a/src/vnet/tcp/builtin_server.c b/src/vnet/tcp/builtin_server.c
index 64fc4a7..4f0e211 100644
--- a/src/vnet/tcp/builtin_server.c
+++ b/src/vnet/tcp/builtin_server.c
@@ -62,7 +62,6 @@
 builtin_session_accept_callback (stream_session_t * s)
 {
   builtin_server_main_t *bsm = &builtin_server_main;
-  clib_warning ("called...");
 
   bsm->vpp_queue[s->thread_index] =
     session_manager_get_vpp_event_queue (s->thread_index);
@@ -76,7 +75,6 @@
 {
   builtin_server_main_t *bsm = &builtin_server_main;
   vnet_disconnect_args_t _a, *a = &_a;
-  clib_warning ("called...");
 
   a->handle = stream_session_handle (s);
   a->app_index = bsm->app_index;
@@ -280,10 +278,11 @@
   a->api_client_index = bsm->my_client_index;
   a->session_cb_vft = &builtin_session_cb_vft;
   a->options = options;
-  a->options[SESSION_OPTIONS_SEGMENT_SIZE] = 128 << 20;
-  a->options[SESSION_OPTIONS_RX_FIFO_SIZE] = 1 << 16;
-  a->options[SESSION_OPTIONS_TX_FIFO_SIZE] = 1 << 16;
+  a->options[SESSION_OPTIONS_SEGMENT_SIZE] = 512 << 20;
+  a->options[SESSION_OPTIONS_RX_FIFO_SIZE] = 64 << 10;
+  a->options[SESSION_OPTIONS_TX_FIFO_SIZE] = 64 << 10;
   a->options[APP_OPTIONS_FLAGS] = APP_OPTIONS_FLAGS_BUILTIN_APP;
+  a->options[APP_OPTIONS_PREALLOC_FIFO_PAIRS] = 8192;
   a->segment_name = segment_name;
   a->segment_name_length = ARRAY_LEN (segment_name);
 
diff --git a/src/vnet/udp/builtin_server.c b/src/vnet/udp/builtin_server.c
index 18684d5..7dd0367 100644
--- a/src/vnet/udp/builtin_server.c
+++ b/src/vnet/udp/builtin_server.c
@@ -111,6 +111,7 @@
   options[SESSION_OPTIONS_ACCEPT_COOKIE] = 0x12345678;
   options[SESSION_OPTIONS_SEGMENT_SIZE] = (2 << 30);	/*$$$$ config / arg */
   options[APP_OPTIONS_FLAGS] = APP_OPTIONS_FLAGS_BUILTIN_APP;
+  options[APP_OPTIONS_PREALLOC_FIFO_PAIRS] = 1024;
 
   a->options = options;