Use thread local storage for thread index

This patch deprecates stack-based thread identification and also
removes the requirement that thread stacks are adjacent.

Finally, possibly annoying for some folks, it renames
all occurrences of cpu_index and cpu_number to thread
index. Using the word "cpu" is misleading here, as a thread
can be migrated to a different CPU, and it is also not related
to the Linux CPU index.

Change-Id: I68cdaf661e701d2336fc953dcb9978d10a70f7c1
Signed-off-by: Damjan Marion <damarion@cisco.com>
diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c
index a517a59..be3b41e 100644
--- a/src/vlib/buffer.c
+++ b/src/vlib/buffer.c
@@ -299,7 +299,7 @@
   if (CLIB_DEBUG == 0)
     return;
 
-  ASSERT (os_get_cpu_number () == 0);
+  ASSERT (vlib_get_thread_index () == 0);
 
   /* smp disaster check */
   if (vec_len (vlib_mains) > 1)
@@ -355,7 +355,7 @@
   vlib_buffer_free_list_t *f;
   int i;
 
-  ASSERT (os_get_cpu_number () == 0);
+  ASSERT (vlib_get_thread_index () == 0);
 
   if (!is_default && pool_elts (bm->buffer_free_list_pool) == 0)
     {
@@ -474,7 +474,7 @@
   u32 merge_index;
   int i;
 
-  ASSERT (os_get_cpu_number () == 0);
+  ASSERT (vlib_get_thread_index () == 0);
 
   f = vlib_buffer_get_free_list (vm, free_list_index);
 
diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h
index 394c336..328660a 100644
--- a/src/vlib/buffer_funcs.h
+++ b/src/vlib/buffer_funcs.h
@@ -209,7 +209,7 @@
 vlib_buffer_is_known (vlib_main_t * vm, u32 buffer_index)
 {
   vlib_buffer_main_t *bm = vm->buffer_main;
-  ASSERT (os_get_cpu_number () == 0);
+  ASSERT (vlib_get_thread_index () == 0);
 
   uword *p = hash_get (bm->buffer_known_hash, buffer_index);
   return p ? p[0] : VLIB_BUFFER_UNKNOWN;
@@ -221,7 +221,7 @@
 			     vlib_buffer_known_state_t state)
 {
   vlib_buffer_main_t *bm = vm->buffer_main;
-  ASSERT (os_get_cpu_number () == 0);
+  ASSERT (vlib_get_thread_index () == 0);
   hash_set (bm->buffer_known_hash, buffer_index, state);
 }
 
diff --git a/src/vlib/cli.c b/src/vlib/cli.c
index f853f65..3cc9507 100644
--- a/src/vlib/cli.c
+++ b/src/vlib/cli.c
@@ -709,7 +709,7 @@
     {
         /* *INDENT-OFF* */
         foreach_vlib_main({
-          heap = clib_per_cpu_mheaps[this_vlib_main->cpu_index];
+          heap = clib_per_cpu_mheaps[this_vlib_main->thread_index];
           mheap = mheap_header(heap);
           mheap->flags |= MHEAP_FLAG_VALIDATE;
           // Turn off small object cache because it delays detection of errors
@@ -722,7 +722,7 @@
     {
         /* *INDENT-OFF* */
         foreach_vlib_main({
-          heap = clib_per_cpu_mheaps[this_vlib_main->cpu_index];
+          heap = clib_per_cpu_mheaps[this_vlib_main->thread_index];
           mheap = mheap_header(heap);
           mheap->flags &= ~MHEAP_FLAG_VALIDATE;
           mheap->flags |= MHEAP_FLAG_SMALL_OBJECT_CACHE;
@@ -733,7 +733,7 @@
     {
         /* *INDENT-OFF* */
         foreach_vlib_main({
-          heap = clib_per_cpu_mheaps[this_vlib_main->cpu_index];
+          heap = clib_per_cpu_mheaps[this_vlib_main->thread_index];
           mheap = mheap_header(heap);
           mheap_validate(heap);
         });
diff --git a/src/vlib/counter.h b/src/vlib/counter.h
index 17a8521..60e2055 100644
--- a/src/vlib/counter.h
+++ b/src/vlib/counter.h
@@ -70,17 +70,17 @@
 
 /** Increment a simple counter
     @param cm - (vlib_simple_counter_main_t *) simple counter main pointer
-    @param cpu_index - (u32) the current cpu index
+    @param thread_index - (u32) the current thread index
     @param index - (u32) index of the counter to increment
     @param increment - (u64) quantitiy to add to the counter
 */
 always_inline void
 vlib_increment_simple_counter (vlib_simple_counter_main_t * cm,
-			       u32 cpu_index, u32 index, u64 increment)
+			       u32 thread_index, u32 index, u64 increment)
 {
   counter_t *my_counters;
 
-  my_counters = cm->counters[cpu_index];
+  my_counters = cm->counters[thread_index];
   my_counters[index] += increment;
 }
 
@@ -201,7 +201,7 @@
 
 /** Increment a combined counter
     @param cm - (vlib_combined_counter_main_t *) comined counter main pointer
-    @param cpu_index - (u32) the current cpu index
+    @param thread_index - (u32) the current thread index
     @param index - (u32) index of the counter to increment
     @param packet_increment - (u64) number of packets to add to the counter
     @param byte_increment - (u64) number of bytes to add to the counter
@@ -209,13 +209,13 @@
 
 always_inline void
 vlib_increment_combined_counter (vlib_combined_counter_main_t * cm,
-				 u32 cpu_index,
+				 u32 thread_index,
 				 u32 index, u64 n_packets, u64 n_bytes)
 {
   vlib_counter_t *my_counters;
 
   /* Use this CPU's counter array */
-  my_counters = cm->counters[cpu_index];
+  my_counters = cm->counters[thread_index];
 
   my_counters[index].packets += n_packets;
   my_counters[index].bytes += n_bytes;
@@ -224,14 +224,14 @@
 /** Pre-fetch a per-thread combined counter for the given object index */
 always_inline void
 vlib_prefetch_combined_counter (const vlib_combined_counter_main_t * cm,
-				u32 cpu_index, u32 index)
+				u32 thread_index, u32 index)
 {
   vlib_counter_t *cpu_counters;
 
   /*
    * This CPU's index is assumed to already be in cache
    */
-  cpu_counters = cm->counters[cpu_index];
+  cpu_counters = cm->counters[thread_index];
   CLIB_PREFETCH (cpu_counters + index, CLIB_CACHE_LINE_BYTES, STORE);
 }
 
diff --git a/src/vlib/error.c b/src/vlib/error.c
index a2c2317..e4ed4ee 100644
--- a/src/vlib/error.c
+++ b/src/vlib/error.c
@@ -149,7 +149,7 @@
   vlib_node_t *n = vlib_get_node (vm, node_index);
   uword l;
 
-  ASSERT (os_get_cpu_number () == 0);
+  ASSERT (vlib_get_thread_index () == 0);
 
   /* Free up any previous error strings. */
   if (n->n_errors > 0)
diff --git a/src/vlib/global_funcs.h b/src/vlib/global_funcs.h
index f51ec38..9dd01fb 100644
--- a/src/vlib/global_funcs.h
+++ b/src/vlib/global_funcs.h
@@ -23,7 +23,7 @@
 vlib_get_main (void)
 {
   vlib_main_t *vm;
-  vm = vlib_mains[os_get_cpu_number ()];
+  vm = vlib_mains[vlib_get_thread_index ()];
   ASSERT (vm);
   return vm;
 }
diff --git a/src/vlib/main.c b/src/vlib/main.c
index b22203f..422d3e2 100644
--- a/src/vlib/main.c
+++ b/src/vlib/main.c
@@ -136,18 +136,18 @@
   else
     {
       f = clib_mem_alloc_aligned_no_fail (n, VLIB_FRAME_ALIGN);
-      f->cpu_index = vm->cpu_index;
+      f->thread_index = vm->thread_index;
       fi = vlib_frame_index_no_check (vm, f);
     }
 
   /* Poison frame when debugging. */
   if (CLIB_DEBUG > 0)
     {
-      u32 save_cpu_index = f->cpu_index;
+      u32 save_thread_index = f->thread_index;
 
       memset (f, 0xfe, n);
 
-      f->cpu_index = save_cpu_index;
+      f->thread_index = save_thread_index;
     }
 
   /* Insert magic number. */
@@ -517,7 +517,7 @@
 	   * a dangling frame reference. Each thread has its own copy of
 	   * the next_frames vector.
 	   */
-	  if (0 && r->cpu_index != next_runtime->cpu_index)
+	  if (0 && r->thread_index != next_runtime->thread_index)
 	    {
 	      nf->frame_index = ~0;
 	      nf->flags &= ~(VLIB_FRAME_PENDING | VLIB_FRAME_IS_ALLOCATED);
@@ -866,7 +866,7 @@
 				  : evm->node_call_elog_event_types,
 				  node_index),
 		/* track */
-		(vm->cpu_index ? &vlib_worker_threads[vm->cpu_index].
+		(vm->thread_index ? &vlib_worker_threads[vm->thread_index].
 		 elog_track : &em->default_track),
 		/* data to log */ n_vectors);
 }
@@ -963,7 +963,7 @@
 
   vm->cpu_time_last_node_dispatch = last_time_stamp;
 
-  if (1 /* || vm->cpu_index == node->cpu_index */ )
+  if (1 /* || vm->thread_index == node->thread_index */ )
     {
       vlib_main_t *stat_vm;
 
@@ -1029,7 +1029,7 @@
 	  {
 	    u32 node_name, vector_length, is_polling;
 	  } *ed;
-	  vlib_worker_thread_t *w = vlib_worker_threads + vm->cpu_index;
+	  vlib_worker_thread_t *w = vlib_worker_threads + vm->thread_index;
 #endif
 
 	  if ((dispatch_state == VLIB_NODE_STATE_INTERRUPT
diff --git a/src/vlib/main.h b/src/vlib/main.h
index 0197b4f..329bf07 100644
--- a/src/vlib/main.h
+++ b/src/vlib/main.h
@@ -156,7 +156,7 @@
   uword *init_functions_called;
 
   /* to compare with node runtime */
-  u32 cpu_index;
+  u32 thread_index;
 
   void **mbuf_alloc_list;
 
diff --git a/src/vlib/node.c b/src/vlib/node.c
index dc0a4de..bbd3a42 100644
--- a/src/vlib/node.c
+++ b/src/vlib/node.c
@@ -99,7 +99,7 @@
   vlib_pending_frame_t *pf;
   i32 i, j, n_insert;
 
-  ASSERT (os_get_cpu_number () == 0);
+  ASSERT (vlib_get_thread_index () == 0);
 
   vlib_worker_thread_barrier_sync (vm);
 
diff --git a/src/vlib/node.h b/src/vlib/node.h
index fc7e7da..1e2f4c3 100644
--- a/src/vlib/node.h
+++ b/src/vlib/node.h
@@ -344,8 +344,8 @@
   /* Number of vector elements currently in frame. */
   u16 n_vectors;
 
-  /* Owner cpuid / heap id */
-  u16 cpu_index;
+  /* Owner thread / heap id */
+  u16 thread_index;
 
   /* Scalar and vector arguments to next node. */
   u8 arguments[0];
@@ -459,7 +459,7 @@
 					  zero before first run of this
 					  node. */
 
-  u16 cpu_index;			/**< CPU this node runs on */
+  u16 thread_index;			/**< thread this node runs on */
 
   u8 runtime_data[0];			/**< Function dependent
 					  node-runtime data. This data is
diff --git a/src/vlib/node_funcs.h b/src/vlib/node_funcs.h
index 1f7d94e..54e3687 100644
--- a/src/vlib/node_funcs.h
+++ b/src/vlib/node_funcs.h
@@ -201,9 +201,9 @@
 vlib_get_frame_no_check (vlib_main_t * vm, uword frame_index)
 {
   vlib_frame_t *f;
-  u32 cpu_index = frame_index & VLIB_CPU_MASK;
+  u32 thread_index = frame_index & VLIB_CPU_MASK;
   u32 offset = frame_index & VLIB_OFFSET_MASK;
-  vm = vlib_mains[cpu_index];
+  vm = vlib_mains[thread_index];
   f = vm->heap_base + offset;
   return f;
 }
@@ -215,10 +215,10 @@
 
   ASSERT (((uword) f & VLIB_CPU_MASK) == 0);
 
-  vm = vlib_mains[f->cpu_index];
+  vm = vlib_mains[f->thread_index];
 
   i = ((u8 *) f - (u8 *) vm->heap_base);
-  return i | f->cpu_index;
+  return i | f->thread_index;
 }
 
 always_inline vlib_frame_t *
diff --git a/src/vlib/threads.c b/src/vlib/threads.c
index ef3a24d..4a111f8 100644
--- a/src/vlib/threads.c
+++ b/src/vlib/threads.c
@@ -35,27 +35,12 @@
 vlib_worker_thread_t *vlib_worker_threads;
 vlib_thread_main_t vlib_thread_main;
 
+__thread uword vlib_thread_index = 0;
+
 uword
 os_get_cpu_number (void)
 {
-  void *sp;
-  uword n;
-  u32 len;
-
-  len = vec_len (vlib_thread_stacks);
-  if (len == 0)
-    return 0;
-
-  /* Get any old stack address. */
-  sp = &sp;
-
-  n = ((uword) sp - (uword) vlib_thread_stacks[0])
-    >> VLIB_LOG2_THREAD_STACK_SIZE;
-
-  /* "processes" have their own stacks, and they always run in thread 0 */
-  n = n >= len ? 0 : n;
-
-  return n;
+  return vlib_thread_index;
 }
 
 uword
@@ -275,21 +260,6 @@
   return 0;
 }
 
-vlib_worker_thread_t *
-vlib_alloc_thread (vlib_main_t * vm)
-{
-  vlib_worker_thread_t *w;
-
-  if (vec_len (vlib_worker_threads) >= vec_len (vlib_thread_stacks))
-    {
-      clib_warning ("out of worker threads... Quitting...");
-      exit (1);
-    }
-  vec_add2 (vlib_worker_threads, w, 1);
-  w->thread_stack = vlib_thread_stacks[w - vlib_worker_threads];
-  return w;
-}
-
 vlib_frame_queue_t *
 vlib_frame_queue_alloc (int nelts)
 {
@@ -427,7 +397,7 @@
       f64 b4 = vlib_time_now_ticks (vm, before);
       vlib_worker_thread_barrier_check (vm, b4);
       /* Bad idea. Dequeue -> enqueue -> dequeue -> trouble */
-      // vlib_frame_queue_dequeue (vm->cpu_index, vm, nm);
+      // vlib_frame_queue_dequeue (vm->thread_index, vm, nm);
     }
 
   elt = fq->elts + (new_tail & (fq->nelts - 1));
@@ -497,6 +467,8 @@
   w->lwp = syscall (SYS_gettid);
   w->thread_id = pthread_self ();
 
+  vlib_thread_index = w - vlib_worker_threads;
+
   rv = (void *) clib_calljmp
     ((uword (*)(uword)) w->thread_function,
      (uword) arg, w->thread_stack + VLIB_THREAD_STACK_SIZE);
@@ -610,7 +582,9 @@
 		  mheap_alloc (0 /* use VM */ , tr->mheap_size);
 	      else
 		w->thread_mheap = main_heap;
-	      w->thread_stack = vlib_thread_stacks[w - vlib_worker_threads];
+
+	      w->thread_stack =
+		vlib_thread_stack_init (w - vlib_worker_threads);
 	      w->thread_function = tr->function;
 	      w->thread_function_arg = w;
 	      w->instance_id = k;
@@ -630,7 +604,7 @@
 	      vm_clone = clib_mem_alloc (sizeof (*vm_clone));
 	      clib_memcpy (vm_clone, vlib_mains[0], sizeof (*vm_clone));
 
-	      vm_clone->cpu_index = worker_thread_index;
+	      vm_clone->thread_index = worker_thread_index;
 	      vm_clone->heap_base = w->thread_mheap;
 	      vm_clone->mbuf_alloc_list = 0;
 	      vm_clone->init_functions_called =
@@ -679,7 +653,7 @@
 	      vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT])
 	      {
 		vlib_node_t *n = vlib_get_node (vm, rt->node_index);
-		rt->cpu_index = vm_clone->cpu_index;
+		rt->thread_index = vm_clone->thread_index;
 		/* copy initial runtime_data from node */
 		if (n->runtime_data && n->runtime_data_bytes > 0)
 		  clib_memcpy (rt->runtime_data, n->runtime_data,
@@ -692,7 +666,7 @@
 	      vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT])
 	      {
 		vlib_node_t *n = vlib_get_node (vm, rt->node_index);
-		rt->cpu_index = vm_clone->cpu_index;
+		rt->thread_index = vm_clone->thread_index;
 		/* copy initial runtime_data from node */
 		if (n->runtime_data && n->runtime_data_bytes > 0)
 		  clib_memcpy (rt->runtime_data, n->runtime_data,
@@ -756,7 +730,8 @@
 		  mheap_alloc (0 /* use VM */ , tr->mheap_size);
 	      else
 		w->thread_mheap = main_heap;
-	      w->thread_stack = vlib_thread_stacks[w - vlib_worker_threads];
+	      w->thread_stack =
+		vlib_thread_stack_init (w - vlib_worker_threads);
 	      w->thread_function = tr->function;
 	      w->thread_function_arg = w;
 	      w->instance_id = j;
@@ -827,7 +802,7 @@
 				  uword n_calls,
 				  uword n_vectors, uword n_clocks);
 
-  ASSERT (os_get_cpu_number () == 0);
+  ASSERT (vlib_get_thread_index () == 0);
 
   if (vec_len (vlib_mains) == 1)
     return;
@@ -835,7 +810,7 @@
   vm = vlib_mains[0];
   nm = &vm->node_main;
 
-  ASSERT (os_get_cpu_number () == 0);
+  ASSERT (vlib_get_thread_index () == 0);
   ASSERT (*vlib_worker_threads->wait_at_barrier == 1);
 
   /*
@@ -955,7 +930,7 @@
       vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL])
       {
 	vlib_node_t *n = vlib_get_node (vm, rt->node_index);
-	rt->cpu_index = vm_clone->cpu_index;
+	rt->thread_index = vm_clone->thread_index;
 	/* copy runtime_data, will be overwritten later for existing rt */
 	if (n->runtime_data && n->runtime_data_bytes > 0)
 	  clib_memcpy (rt->runtime_data, n->runtime_data,
@@ -981,7 +956,7 @@
       vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT])
       {
 	vlib_node_t *n = vlib_get_node (vm, rt->node_index);
-	rt->cpu_index = vm_clone->cpu_index;
+	rt->thread_index = vm_clone->thread_index;
 	/* copy runtime_data, will be overwritten later for existing rt */
 	if (n->runtime_data && n->runtime_data_bytes > 0)
 	  clib_memcpy (rt->runtime_data, n->runtime_data,
@@ -1180,7 +1155,7 @@
   if (vlib_mains == 0)
     return;
 
-  ASSERT (os_get_cpu_number () == 0);
+  ASSERT (vlib_get_thread_index () == 0);
   vlib_worker_thread_barrier_sync (vm);
 
   switch (which)
@@ -1212,7 +1187,7 @@
 
   vlib_worker_threads[0].barrier_sync_count++;
 
-  ASSERT (os_get_cpu_number () == 0);
+  ASSERT (vlib_get_thread_index () == 0);
 
   deadline = vlib_time_now (vm) + BARRIER_SYNC_TIMEOUT;
 
@@ -1260,7 +1235,7 @@
 int
 vlib_frame_queue_dequeue (vlib_main_t * vm, vlib_frame_queue_main_t * fqm)
 {
-  u32 thread_id = vm->cpu_index;
+  u32 thread_id = vm->thread_index;
   vlib_frame_queue_t *fq = fqm->vlib_frame_queues[thread_id];
   vlib_frame_queue_elt_t *elt;
   u32 *from, *to;
@@ -1393,7 +1368,7 @@
   vlib_main_t *vm = vlib_get_main ();
   clib_error_t *e;
 
-  ASSERT (vm->cpu_index == os_get_cpu_number ());
+  ASSERT (vm->thread_index == vlib_get_thread_index ());
 
   vlib_worker_thread_init (w);
   clib_time_init (&vm->clib_time);
diff --git a/src/vlib/threads.h b/src/vlib/threads.h
index eca4fc2..101d3d4 100644
--- a/src/vlib/threads.h
+++ b/src/vlib/threads.h
@@ -153,8 +153,6 @@
 /* Called early, in thread 0's context */
 clib_error_t *vlib_thread_init (vlib_main_t * vm);
 
-vlib_worker_thread_t *vlib_alloc_thread (vlib_main_t * vm);
-
 int vlib_frame_queue_enqueue (vlib_main_t * vm, u32 node_runtime_index,
 			      u32 frame_queue_index, vlib_frame_t * frame,
 			      vlib_frame_queue_msg_type_t type);
@@ -183,12 +181,19 @@
 void vlib_worker_thread_barrier_sync (vlib_main_t * vm);
 void vlib_worker_thread_barrier_release (vlib_main_t * vm);
 
+extern __thread uword vlib_thread_index;
+static_always_inline uword
+vlib_get_thread_index (void)
+{
+  return vlib_thread_index;
+}
+
 always_inline void
 vlib_smp_unsafe_warning (void)
 {
   if (CLIB_DEBUG > 0)
     {
-      if (os_get_cpu_number ())
+      if (vlib_get_thread_index ())
 	fformat (stderr, "%s: SMP unsafe warning...\n", __FUNCTION__);
     }
 }
@@ -331,21 +336,21 @@
 }
 
 always_inline u32
-vlib_get_worker_cpu_index (u32 worker_index)
+vlib_get_worker_thread_index (u32 worker_index)
 {
   return worker_index + 1;
 }
 
 always_inline u32
-vlib_get_worker_index (u32 cpu_index)
+vlib_get_worker_index (u32 thread_index)
 {
-  return cpu_index - 1;
+  return thread_index - 1;
 }
 
 always_inline u32
 vlib_get_current_worker_index ()
 {
-  return os_get_cpu_number () - 1;
+  return vlib_get_thread_index () - 1;
 }
 
 static inline void
@@ -467,6 +472,8 @@
   return elt;
 }
 
+u8 *vlib_thread_stack_init (uword thread_index);
+
 int vlib_thread_cb_register (struct vlib_main_t *vm,
 			     vlib_thread_callbacks_t * cb);
 
diff --git a/src/vlib/unix/cj.c b/src/vlib/unix/cj.c
index 33ba163..7c1e947 100644
--- a/src/vlib/unix/cj.c
+++ b/src/vlib/unix/cj.c
@@ -48,7 +48,7 @@
 
   r = (cj_record_t *) & (cjm->records[new_tail & (cjm->num_records - 1)]);
   r->time = vlib_time_now (cjm->vlib_main);
-  r->cpu = os_get_cpu_number ();
+  r->thread_index = vlib_get_thread_index ();
   r->type = type;
   r->data[0] = pointer_to_uword (data0);
   r->data[1] = pointer_to_uword (data1);
@@ -133,7 +133,8 @@
 cj_dump_one_record (cj_record_t * r)
 {
   fprintf (stderr, "[%d]: %10.6f T%02d %llx %llx\n",
-	   r->cpu, r->time, r->type, (long long unsigned int) r->data[0],
+	   r->thread_index, r->time, r->type,
+	   (long long unsigned int) r->data[0],
 	   (long long unsigned int) r->data[1]);
 }
 
@@ -161,7 +162,7 @@
   index = (cjm->tail + 1) & (cjm->num_records - 1);
   r = &(cjm->records[index]);
 
-  if (r->cpu != (u32) ~ 0)
+  if (r->thread_index != (u32) ~ 0)
     {
       /* Yes, dump from tail + 1 to the end */
       for (i = index; i < cjm->num_records; i++)
diff --git a/src/vlib/unix/cj.h b/src/vlib/unix/cj.h
index 67626af..d0a1d46 100644
--- a/src/vlib/unix/cj.h
+++ b/src/vlib/unix/cj.h
@@ -23,7 +23,7 @@
 typedef struct
 {
   f64 time;
-  u32 cpu;
+  u32 thread_index;
   u32 type;
   u64 data[2];
 } cj_record_t;
diff --git a/src/vlib/unix/main.c b/src/vlib/unix/main.c
index 6b96cc0..db5ddd6 100644
--- a/src/vlib/unix/main.c
+++ b/src/vlib/unix/main.c
@@ -510,13 +510,28 @@
   return i;
 }
 
+u8 *
+vlib_thread_stack_init (uword thread_index)
+{
+  vec_validate (vlib_thread_stacks, thread_index);
+  vlib_thread_stacks[thread_index] = clib_mem_alloc_aligned
+    (VLIB_THREAD_STACK_SIZE, VLIB_THREAD_STACK_SIZE);
+
+  /*
+   * Disallow writes to the bottom page of the stack, to
+   * catch stack overflows.
+   */
+  if (mprotect (vlib_thread_stacks[thread_index],
+		clib_mem_get_page_size (), PROT_READ) < 0)
+    clib_unix_warning ("thread stack");
+  return vlib_thread_stacks[thread_index];
+}
+
 int
 vlib_unix_main (int argc, char *argv[])
 {
   vlib_main_t *vm = &vlib_global_main;	/* one and only time for this! */
-  vlib_thread_main_t *tm = &vlib_thread_main;
   unformat_input_t input;
-  u8 *thread_stacks;
   clib_error_t *e;
   int i;
 
@@ -548,29 +563,9 @@
     }
   unformat_free (&input);
 
-  /*
-   * allocate n x VLIB_THREAD_STACK_SIZE stacks, aligned to a
-   * VLIB_THREAD_STACK_SIZE boundary
-   * See also: os_get_cpu_number() in vlib/vlib/threads.c
-   */
-  thread_stacks = clib_mem_alloc_aligned
-    ((uword) tm->n_thread_stacks * VLIB_THREAD_STACK_SIZE,
-     VLIB_THREAD_STACK_SIZE);
+  vlib_thread_stack_init (0);
 
-  vec_validate (vlib_thread_stacks, tm->n_thread_stacks - 1);
-  for (i = 0; i < vec_len (vlib_thread_stacks); i++)
-    {
-      vlib_thread_stacks[i] = thread_stacks;
-
-      /*
-       * Disallow writes to the bottom page of the stack, to
-       * catch stack overflows.
-       */
-      if (mprotect (thread_stacks, clib_mem_get_page_size (), PROT_READ) < 0)
-	clib_unix_warning ("thread stack");
-
-      thread_stacks += VLIB_THREAD_STACK_SIZE;
-    }
+  vlib_thread_index = 0;
 
   i = clib_calljmp (thread0, (uword) vm,
 		    (void *) (vlib_thread_stacks[0] +