vlib physmem rework

This patch adds support for multiple NUMA-aware physmem regions.

Change-Id: I5c69a6f4da33c8ee21bdb8604d52fd2886f2327e
Signed-off-by: Damjan Marion <damarion@cisco.com>
diff --git a/src/examples/vlib/main_stub.c b/src/examples/vlib/main_stub.c
index 4d74bd7..3b19c53 100644
--- a/src/examples/vlib/main_stub.c
+++ b/src/examples/vlib/main_stub.c
@@ -27,8 +27,7 @@
 {
   clib_error_t *error;
 
-  if ((error =
-       unix_physmem_init (vm, /* fail_if_physical_memory_not_present */ 0)))
+  if ((error = unix_physmem_init (vm)))
     return error;
 
   if ((error = vlib_call_init_function (vm, unix_cli_init)))
diff --git a/src/plugins/dpdk/api/dpdk_api.c b/src/plugins/dpdk/api/dpdk_api.c
index 08afdd7..97c4bc7 100755
--- a/src/plugins/dpdk/api/dpdk_api.c
+++ b/src/plugins/dpdk/api/dpdk_api.c
@@ -20,7 +20,6 @@
 
 #include <vnet/ethernet/ethernet.h>
 #include <dpdk/device/dpdk.h>
-#include <vlib/unix/physmem.h>
 #include <vlib/pci/pci.h>
 
 #include <stdio.h>
diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c
index a795ba0..e23542f 100755
--- a/src/plugins/dpdk/device/init.c
+++ b/src/plugins/dpdk/device/init.c
@@ -17,10 +17,10 @@
 #include <vppinfra/error.h>
 #include <vppinfra/format.h>
 #include <vppinfra/bitmap.h>
+#include <vlib/unix/unix.h>
 
 #include <vnet/ethernet/ethernet.h>
 #include <dpdk/device/dpdk.h>
-#include <vlib/unix/physmem.h>
 #include <vlib/pci/pci.h>
 
 #include <stdio.h>
@@ -1026,21 +1026,28 @@
       clib_bitmap_foreach (c, tm->cpu_socket_bitmap, (
         {
 	  int pages_avail, page_size, mem;
+	  clib_error_t  *e = 0;
 
 	  vec_validate(mem_by_socket, c);
 	  mem = mem_by_socket[c];
 
 	  page_size = 1024;
-	  pages_avail = vlib_sysfs_get_free_hugepages(c, page_size * 1024);
+	  e = vlib_sysfs_get_free_hugepages(c, page_size * 1024, &pages_avail);
 
-	  if (pages_avail < 0 || page_size * pages_avail < mem)
+	  if (e != 0 || pages_avail < 0 || page_size * pages_avail < mem)
 	    use_1g = 0;
 
-	  page_size = 2;
-	  pages_avail = vlib_sysfs_get_free_hugepages(c, page_size * 1024);
+	  if (e)
+	   clib_error_free (e);
 
-	  if (pages_avail < 0 || page_size * pages_avail < mem)
+	  page_size = 2;
+	  e = vlib_sysfs_get_free_hugepages(c, page_size * 1024, &pages_avail);
+
+	  if (e != 0 || pages_avail < 0 || page_size * pages_avail < mem)
 	    use_2m = 0;
+
+	  if (e)
+	   clib_error_free (e);
       }));
       /* *INDENT-ON* */
 
diff --git a/src/plugins/dpdk/hqos/hqos.c b/src/plugins/dpdk/hqos/hqos.c
index 813eb91..c9b8565 100644
--- a/src/plugins/dpdk/hqos/hqos.c
+++ b/src/plugins/dpdk/hqos/hqos.c
@@ -29,7 +29,6 @@
 #include <vnet/ethernet/ethernet.h>
 #include <dpdk/device/dpdk.h>
 
-#include <vlib/unix/physmem.h>
 #include <vlib/pci/pci.h>
 #include <vlibmemory/api.h>
 #include <vlibmemory/vl_memory_msg_enum.h>	/* enumerate all vlib messages */
diff --git a/src/plugins/ixge/ixge.c b/src/plugins/ixge/ixge.c
index e0150f4..222c148 100644
--- a/src/plugins/ixge/ixge.c
+++ b/src/plugins/ixge/ixge.c
@@ -2493,10 +2493,11 @@
     round_pow2 (xm->n_descriptors[rt], xm->n_descriptors_per_cache_line);
   dq->head_index = dq->tail_index = 0;
 
-  dq->descriptors = vlib_physmem_alloc_aligned (vm, &error,
-						dq->n_descriptors *
-						sizeof (dq->descriptors[0]),
-						128 /* per chip spec */ );
+  dq->descriptors =
+    vlib_physmem_alloc_aligned (vm, xm->physmem_region, &error,
+				dq->n_descriptors *
+				sizeof (dq->descriptors[0]),
+				128 /* per chip spec */ );
   if (error)
     return error;
 
@@ -2518,7 +2519,8 @@
 	  vlib_buffer_t *b =
 	    vlib_get_buffer (vm, dq->descriptor_buffer_indices[i]);
 	  dq->descriptors[i].rx_to_hw.tail_address =
-	    vlib_physmem_virtual_to_physical (vm, b->data);
+	    vlib_physmem_virtual_to_physical (vm, vm->buffer_main->
+					      physmem_region, b->data);
 	}
     }
   else
@@ -2526,7 +2528,8 @@
       u32 i;
 
       dq->tx.head_index_write_back =
-	vlib_physmem_alloc (vm, &error, CLIB_CACHE_LINE_BYTES);
+	vlib_physmem_alloc (vm, vm->buffer_main->physmem_region, &error,
+			    CLIB_CACHE_LINE_BYTES);
 
       for (i = 0; i < dq->n_descriptors; i++)
 	dq->descriptors[i].tx = xm->tx_descriptor_template;
@@ -2538,7 +2541,9 @@
     ixge_dma_regs_t *dr = get_dma_regs (xd, rt, queue_index);
     u64 a;
 
-    a = vlib_physmem_virtual_to_physical (vm, dq->descriptors);
+    a =
+      vlib_physmem_virtual_to_physical (vm, xm->physmem_region,
+					dq->descriptors);
     dr->descriptor_address[0] = a & 0xFFFFFFFF;
     dr->descriptor_address[1] = a >> (u64) 32;
     dr->n_descriptor_bytes = dq->n_descriptors * sizeof (dq->descriptors[0]);
@@ -2564,7 +2569,9 @@
 	dq->tx.head_index_write_back[0] = dq->head_index;
 
 	a =
-	  vlib_physmem_virtual_to_physical (vm, dq->tx.head_index_write_back);
+	  vlib_physmem_virtual_to_physical (vm,
+					    vm->buffer_main->physmem_region,
+					    dq->tx.head_index_write_back);
 	dr->tx.head_index_write_back_address[0] = /* enable bit */ 1 | a;
 	dr->tx.head_index_write_back_address[1] = (u64) a >> (u64) 32;
       }
@@ -2850,9 +2857,12 @@
   void *r;
   ixge_device_t *xd;
 
-  /* Device found: make sure we have dma memory. */
-  if (unix_physmem_is_fake (vm))
-    return clib_error_return (0, "no physical memory available");
+  /* Allocate physmem region for DMA buffers */
+  error = vlib_physmem_region_alloc (vm, "ixge decriptors", 2 << 20, 0,
+				     VLIB_PHYSMEM_F_INIT_MHEAP,
+				     &xm->physmem_region);
+  if (error)
+    return error;
 
   error = vlib_pci_map_resource (dev, 0, &r);
   if (error)
diff --git a/src/plugins/ixge/ixge.h b/src/plugins/ixge/ixge.h
index 779603b..42c1bfa 100644
--- a/src/plugins/ixge/ixge.h
+++ b/src/plugins/ixge/ixge.h
@@ -1266,6 +1266,8 @@
   u32 *rx_buffers_to_add;
 
   f64 time_last_stats_update;
+
+  vlib_physmem_region_index_t physmem_region;
 } ixge_main_t;
 
 ixge_main_t ixge_main;
diff --git a/src/vlib.am b/src/vlib.am
index 111dcfa..cab90e2 100644
--- a/src/vlib.am
+++ b/src/vlib.am
@@ -13,7 +13,7 @@
 
 
 lib_LTLIBRARIES += libvlib.la
-libvlib_la_LIBADD = libvppinfra.la -ldl -lpthread
+libvlib_la_LIBADD = libvppinfra.la -ldl -lpthread -lnuma
 libvlib_la_DEPENDENCIES = libvppinfra.la
 
 BUILT_SOURCES += vlib/config.h
@@ -65,6 +65,7 @@
   vlib/physmem.h				\
   vlib/pci/pci.h				\
   vlib/pci/pci_config.h				\
+  vlib/physmem_funcs.h				\
   vlib/threads.h				\
   vlib/trace_funcs.h				\
   vlib/trace.h					\
@@ -84,7 +85,6 @@
 nobase_include_HEADERS +=			\
   vlib/unix/cj.h				\
   vlib/unix/mc_socket.h				\
-  vlib/unix/physmem.h				\
   vlib/unix/plugin.h				\
   vlib/unix/unix.h
 
diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c
index 908368c..a5ec0e0 100644
--- a/src/vlib/buffer.c
+++ b/src/vlib/buffer.c
@@ -47,6 +47,7 @@
 #include <vlib/unix/unix.h>
 
 vlib_buffer_callbacks_t *vlib_buffer_callbacks = 0;
+static u32 vlib_buffer_physmem_sz = 32 << 20;
 
 uword
 vlib_buffer_length_in_chain_slow_path (vlib_main_t * vm,
@@ -461,7 +462,8 @@
   u32 i;
 
   for (i = 0; i < vec_len (f->buffer_memory_allocated); i++)
-    vm->os_physmem_free (f->buffer_memory_allocated[i]);
+    vm->os_physmem_free (vm, vm->buffer_main->physmem_region,
+			 f->buffer_memory_allocated[i]);
   vec_free (f->name);
   vec_free (f->buffer_memory_allocated);
   vec_free (f->buffers);
@@ -552,9 +554,9 @@
       n_bytes = n_this_chunk * (sizeof (b[0]) + fl->n_data_bytes);
 
       /* drb: removed power-of-2 ASSERT */
-      buffers = vm->os_physmem_alloc_aligned (&vm->physmem_main,
-					      n_bytes,
-					      sizeof (vlib_buffer_t));
+      buffers =
+	vm->os_physmem_alloc_aligned (vm, vm->buffer_main->physmem_region,
+				      n_bytes, sizeof (vlib_buffer_t));
       if (!buffers)
 	return n_alloc;
 
@@ -1051,10 +1053,25 @@
 };
 /* *INDENT-ON* */
 
-void
-vlib_buffer_cb_init (struct vlib_main_t *vm)
+clib_error_t *
+vlib_buffer_main_init (struct vlib_main_t * vm)
 {
-  vlib_buffer_main_t *bm = vm->buffer_main;
+  vlib_buffer_main_t *bm;
+  clib_error_t *error;
+
+  vec_validate (vm->buffer_main, 0);
+  bm = vm->buffer_main;
+
+  if (vlib_buffer_callbacks)
+    {
+      /* external plugin has registered own buffer callbacks
+         so we just copy them  and quit */
+      vlib_buffer_main_t *bm = vm->buffer_main;
+      clib_memcpy (&bm->cb, vlib_buffer_callbacks,
+		   sizeof (vlib_buffer_callbacks_t));
+      bm->callbacks_registered = 1;
+      return 0;
+    }
 
   bm->cb.vlib_buffer_alloc_cb = &vlib_buffer_alloc_internal;
   bm->cb.vlib_buffer_alloc_from_free_list_cb =
@@ -1064,8 +1081,49 @@
   bm->cb.vlib_buffer_delete_free_list_cb =
     &vlib_buffer_delete_free_list_internal;
   clib_spinlock_init (&bm->buffer_known_hash_lockp);
+
+  /* allocate default region */
+  error = vlib_physmem_region_alloc (vm, "buffers",
+				     vlib_buffer_physmem_sz, 0,
+				     VLIB_PHYSMEM_F_INIT_MHEAP |
+				     VLIB_PHYSMEM_F_HAVE_BUFFERS,
+				     &bm->physmem_region);
+
+  if (error == 0)
+    return 0;
+
+  clib_error_free (error);
+
+  /* we may be running unprivileged, so try to allocate fake physmem */
+  error = vlib_physmem_region_alloc (vm, "buffers (fake)",
+				     vlib_buffer_physmem_sz, 0,
+				     VLIB_PHYSMEM_F_FAKE |
+				     VLIB_PHYSMEM_F_INIT_MHEAP |
+				     VLIB_PHYSMEM_F_HAVE_BUFFERS,
+				     &bm->physmem_region);
+  return error;
 }
 
+/* Parse "buffers" section; memory-size-in-mb must fit a u32 byte count. */
+static clib_error_t *
+vlib_buffers_configure (vlib_main_t * vm, unformat_input_t * input)
+{
+  u32 size_in_mb;
+  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (!unformat (input, "memory-size-in-mb %d", &size_in_mb))
+	return unformat_parse_error (input);
+      if (size_in_mb == 0 || size_in_mb >= (1 << 12))
+	return clib_error_return (0, "invalid buffer memory size");
+      vlib_buffer_physmem_sz = size_in_mb << 20;
+    }
+  unformat_free (input);
+  return 0;
+}
+
+VLIB_EARLY_CONFIG_FUNCTION (vlib_buffers_configure, "buffers");
+
+
 /** @endcond */
 /*
  * fd.io coding-style-patch-verification: ON
diff --git a/src/vlib/buffer.h b/src/vlib/buffer.h
index 5504bf7..e47dbc6 100644
--- a/src/vlib/buffer.h
+++ b/src/vlib/buffer.h
@@ -408,6 +408,7 @@
      buffer index */
   uword buffer_mem_start;
   uword buffer_mem_size;
+  vlib_physmem_region_index_t physmem_region;
 
   /* Buffer free callback, for subversive activities */
     u32 (*buffer_free_callback) (struct vlib_main_t * vm,
@@ -442,7 +443,7 @@
 
 void vlib_buffer_add_mem_range (struct vlib_main_t *vm, uword start,
 				uword size);
-void vlib_buffer_cb_init (struct vlib_main_t *vm);
+clib_error_t *vlib_buffer_main_init (struct vlib_main_t *vm);
 
 typedef struct
 {
diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h
index 78bf931..d51de6b 100644
--- a/src/vlib/buffer_funcs.h
+++ b/src/vlib/buffer_funcs.h
@@ -162,7 +162,7 @@
 always_inline u64
 vlib_get_buffer_data_physical_address (vlib_main_t * vm, u32 buffer_index)
 {
-  return vlib_physmem_offset_to_physical (&vm->physmem_main,
+  return vlib_physmem_offset_to_physical (vm, vm->buffer_main->physmem_region,
 					  (((uword) buffer_index) <<
 					   CLIB_LOG2_CACHE_LINE_BYTES) +
 					  STRUCT_OFFSET_OF (vlib_buffer_t,
@@ -455,43 +455,6 @@
     }
 }
 
-always_inline void *
-vlib_physmem_alloc_aligned (vlib_main_t * vm, clib_error_t ** error,
-			    uword n_bytes, uword alignment)
-{
-  void *r =
-    vm->os_physmem_alloc_aligned (&vm->physmem_main, n_bytes, alignment);
-  if (!r)
-    *error =
-      clib_error_return (0, "failed to allocate %wd bytes of I/O memory",
-			 n_bytes);
-  else
-    *error = 0;
-  return r;
-}
-
-/* By default allocate I/O memory with cache line alignment. */
-always_inline void *
-vlib_physmem_alloc (vlib_main_t * vm, clib_error_t ** error, uword n_bytes)
-{
-  return vlib_physmem_alloc_aligned (vm, error, n_bytes,
-				     CLIB_CACHE_LINE_BYTES);
-}
-
-always_inline void
-vlib_physmem_free (vlib_main_t * vm, void *mem)
-{
-  return vm->os_physmem_free (mem);
-}
-
-always_inline u64
-vlib_physmem_virtual_to_physical (vlib_main_t * vm, void *mem)
-{
-  vlib_physmem_main_t *pm = &vm->physmem_main;
-  uword o = pointer_to_uword (mem) - pm->virtual.start;
-  return vlib_physmem_offset_to_physical (pm, o);
-}
-
 /* Append given data to end of buffer, possibly allocating new buffers. */
 u32 vlib_buffer_add_data (vlib_main_t * vm,
 			  u32 free_list_index,
diff --git a/src/vlib/main.c b/src/vlib/main.c
index 5d99e89..7875f62 100644
--- a/src/vlib/main.c
+++ b/src/vlib/main.c
@@ -1705,22 +1705,16 @@
   if (!vm->name)
     vm->name = "VLIB";
 
-  vec_validate (vm->buffer_main, 0);
-  if (vlib_buffer_callbacks)
+  if ((error = unix_physmem_init (vm)))
     {
-      /* external plugin has registered own buffer callbacks
-         so we just copy them */
-      vlib_buffer_main_t *bm = vm->buffer_main;
-      clib_memcpy (&bm->cb, vlib_buffer_callbacks,
-		   sizeof (vlib_buffer_callbacks_t));
-      bm->callbacks_registered = 1;
+      clib_error_report (error);
+      goto done;
     }
-  else
+
+  if ((error = vlib_buffer_main_init (vm)))
     {
-      vlib_physmem_main_t *vpm = &vm->physmem_main;
-      vlib_buffer_cb_init (vm);
-      unix_physmem_init (vm, 0 /* fail_if_physical_memory_not_present */ );
-      vlib_buffer_add_mem_range (vm, vpm->virtual.start, vpm->virtual.size);
+      clib_error_report (error);
+      goto done;
     }
 
   if ((error = vlib_thread_init (vm)))
diff --git a/src/vlib/main.h b/src/vlib/main.h
index b63c63f..4c0cde3 100644
--- a/src/vlib/main.h
+++ b/src/vlib/main.h
@@ -107,9 +107,21 @@
 
   /* Allocate/free buffer memory for DMA transfers, descriptor rings, etc.
      buffer memory is guaranteed to be cache-aligned. */
-  void *(*os_physmem_alloc_aligned) (vlib_physmem_main_t * pm,
+
+  clib_error_t *(*os_physmem_region_alloc) (struct vlib_main_t * vm,
+					    char *name, u32 size,
+					    u8 numa_node, u32 flags,
+					    vlib_physmem_region_index_t *
+					    idx);
+
+  void (*os_physmem_region_free) (struct vlib_main_t * vm,
+				  vlib_physmem_region_index_t idx);
+
+  void *(*os_physmem_alloc_aligned) (struct vlib_main_t * vm,
+				     vlib_physmem_region_index_t idx,
 				     uword n_bytes, uword alignment);
-  void (*os_physmem_free) (void *x);
+  void (*os_physmem_free) (struct vlib_main_t * vm,
+			   vlib_physmem_region_index_t idx, void *x);
 
   /* Node graph main structure. */
   vlib_node_main_t node_main;
diff --git a/src/vlib/physmem.h b/src/vlib/physmem.h
index 9e7d52a..a7fed12 100644
--- a/src/vlib/physmem.h
+++ b/src/vlib/physmem.h
@@ -40,63 +40,36 @@
 #ifndef included_vlib_physmem_h
 #define included_vlib_physmem_h
 
+typedef u8 vlib_physmem_region_index_t;
+
 typedef struct
 {
-  uword start, end, size;
+  vlib_physmem_region_index_t index;
+  void *mem;
+  uword size;
+  int fd;
+  u8 log2_page_size;
+  u16 n_pages;
+  u32 page_mask;
+
+  void *heap;
+  u32 flags;
+#define VLIB_PHYSMEM_F_INIT_MHEAP (1<<0)
+#define VLIB_PHYSMEM_F_HAVE_BUFFERS (1<<1)
+#define VLIB_PHYSMEM_F_FAKE (1<<2)
+
+  u8 numa_node;
+  u64 *page_table;
+  u8 *name;
 } vlib_physmem_region_t;
 
+
+
 typedef struct
 {
-  vlib_physmem_region_t virtual;
-
-  uword log2_n_bytes_per_page;
-
-  /* 1 << log2_n_bytes_per_page - 1. */
-  uword page_mask;
-
-  u64 *page_table;
-
-  /* is fake physmem */
-  u8 is_fake;
+  vlib_physmem_region_t *regions;
 } vlib_physmem_main_t;
 
-always_inline u64
-vlib_physmem_offset_to_physical (vlib_physmem_main_t * pm, uword o)
-{
-  uword page_index = o >> pm->log2_n_bytes_per_page;
-  ASSERT (o < pm->virtual.size);
-  ASSERT (pm->page_table[page_index] != 0);
-  return (vec_elt (pm->page_table, page_index) + (o & pm->page_mask));
-}
-
-always_inline int
-vlib_physmem_is_virtual (vlib_physmem_main_t * pm, uword p)
-{
-  return p >= pm->virtual.start && p < pm->virtual.end;
-}
-
-always_inline uword
-vlib_physmem_offset_of (vlib_physmem_main_t * pm, void *p)
-{
-  uword a = pointer_to_uword (p);
-  uword o;
-
-  ASSERT (vlib_physmem_is_virtual (pm, a));
-  o = a - pm->virtual.start;
-
-  /* Offset must fit in 32 bits. */
-  ASSERT ((uword) o == a - pm->virtual.start);
-
-  return o;
-}
-
-always_inline void *
-vlib_physmem_at_offset (vlib_physmem_main_t * pm, uword offset)
-{
-  ASSERT (offset < pm->virtual.size);
-  return uword_to_pointer (pm->virtual.start + offset, void *);
-}
-
 #endif /* included_vlib_physmem_h */
 
 /*
diff --git a/src/vlib/physmem_funcs.h b/src/vlib/physmem_funcs.h
new file mode 100644
index 0000000..dbb8d9d
--- /dev/null
+++ b/src/vlib/physmem_funcs.h
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * physmem.h: virtual <-> physical memory mapping for VLIB buffers
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_vlib_physmem_funcs_h
+#define included_vlib_physmem_funcs_h
+
+/* Look up the physmem region stored at pool slot 'index'. */
+always_inline vlib_physmem_region_t *
+vlib_physmem_get_region (vlib_main_t * vm, u8 index)
+{
+  return pool_elt_at_index (vm->physmem_main.regions, index);
+}
+
+/* Physical address of byte offset o within region idx. */
+always_inline u64
+vlib_physmem_offset_to_physical (vlib_main_t * vm,
+				 vlib_physmem_region_index_t idx, uword o)
+{
+  vlib_physmem_region_t *r = vlib_physmem_get_region (vm, idx);
+  ASSERT (o < r->size);
+  ASSERT (r->page_table[o >> r->log2_page_size] != 0);
+  return vec_elt (r->page_table, o >> r->log2_page_size) + (o & r->page_mask);
+}
+
+/* True when address p falls inside the memory mapped for region idx. */
+always_inline int
+vlib_physmem_is_virtual (vlib_main_t * vm, vlib_physmem_region_index_t idx,
+			 uword p)
+{
+  uword lo = pointer_to_uword (vlib_physmem_get_region (vm, idx)->mem);
+  return lo <= p && p < lo + vlib_physmem_get_region (vm, idx)->size;
+}
+
+/* Byte offset of pointer p from the start of region idx; ASSERTs that
+   p lies inside the region. */
+always_inline uword
+vlib_physmem_offset_of (vlib_main_t * vm, vlib_physmem_region_index_t idx,
+			void *p)
+{
+  vlib_physmem_region_t *region = vlib_physmem_get_region (vm, idx);
+  uword addr = pointer_to_uword (p);
+  uword base = pointer_to_uword (region->mem);
+  uword offset = addr - base;
+
+  ASSERT (vlib_physmem_is_virtual (vm, idx, addr));
+  /* NOTE(review): offset is a uword, so this check cannot fail. */
+  ASSERT ((uword) offset == addr - base);
+  return offset;
+}
+
+/* Pointer to the byte at 'offset' inside region idx. */
+always_inline void *
+vlib_physmem_at_offset (vlib_main_t * vm, vlib_physmem_region_index_t idx,
+			uword offset)
+{
+  ASSERT (offset < vlib_physmem_get_region (vm, idx)->size);
+  return (u8 *) vlib_physmem_get_region (vm, idx)->mem + offset;
+}
+
+/* Allocate n_bytes from region idx with the requested alignment.  On
+   failure returns 0 and stores an error in *error; else *error is 0. */
+always_inline void *
+vlib_physmem_alloc_aligned (vlib_main_t * vm, vlib_physmem_region_index_t idx,
+			    clib_error_t ** error,
+			    uword n_bytes, uword alignment)
+{
+  void *mem = vm->os_physmem_alloc_aligned (vm, idx, n_bytes, alignment);
+
+  *error = mem ? 0 :
+    clib_error_return (0, "failed to allocate %wd bytes of I/O memory",
+		       n_bytes);
+  return mem;
+}
+
+/* Convenience wrapper: allocate with CLIB_CACHE_LINE_BYTES alignment. */
+always_inline void *
+vlib_physmem_alloc (vlib_main_t * vm, vlib_physmem_region_index_t idx,
+		    clib_error_t ** error, uword n_bytes)
+{
+  uword align = CLIB_CACHE_LINE_BYTES;
+  return vlib_physmem_alloc_aligned (vm, idx, error, n_bytes, align);
+}
+
+always_inline void
+vlib_physmem_free (vlib_main_t * vm, vlib_physmem_region_index_t idx,
+		   void *mem)
+{
+  vm->os_physmem_free (vm, idx, mem);	/* hand mem back to region idx */
+}
+
+/* Translate a virtual address inside region idx to its physical address. */
+always_inline u64
+vlib_physmem_virtual_to_physical (vlib_main_t * vm,
+				  vlib_physmem_region_index_t idx, void *mem)
+{
+  /* vlib_physmem_offset_of ASSERTs that mem lies inside the region. */
+  uword o = vlib_physmem_offset_of (vm, idx, mem);
+  return vlib_physmem_offset_to_physical (vm, idx, o);
+}
+
+
+always_inline clib_error_t *
+vlib_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size,
+			   u8 numa_node, u32 flags,
+			   vlib_physmem_region_index_t * idx)
+{				/* forwards to the os_physmem_region_alloc hook in vm */
+  return vm->os_physmem_region_alloc (vm, name, size, numa_node, flags, idx);
+}
+
+always_inline void
+vlib_physmem_region_free (struct vlib_main_t *vm,
+			  vlib_physmem_region_index_t idx)
+{				/* forwards to the os_physmem_region_free hook in vm */
+  vm->os_physmem_region_free (vm, idx);
+}
+
+#endif /* included_vlib_physmem_funcs_h */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/unix/physmem.c b/src/vlib/unix/physmem.c
index 27a5bac..d5d5d6c 100644
--- a/src/vlib/unix/physmem.c
+++ b/src/vlib/unix/physmem.c
@@ -37,24 +37,66 @@
  *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
-#include <vlib/unix/physmem.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/mount.h>
+#include <sys/mman.h>
+#include <sys/fcntl.h>
+#include <sys/stat.h>
+#include <numa.h>
+#include <numaif.h>
 
-static physmem_main_t physmem_main;
+#include <vlib/vlib.h>
+#include <vlib/physmem.h>
+#include <vlib/unix/unix.h>
+
+#ifndef __NR_memfd_create
+#if defined __x86_64__
+#define __NR_memfd_create 319
+#elif defined __arm__
+#define __NR_memfd_create 385
+#elif defined __aarch64__
+#define __NR_memfd_create 279
+#else
+#error "__NR_memfd_create unknown for this architecture"
+#endif
+#endif
+
+static inline int
+memfd_create (const char *name, unsigned int flags)
+{
+  return syscall (__NR_memfd_create, name, flags);
+}
+
+#ifndef F_LINUX_SPECIFIC_BASE
+#define F_LINUX_SPECIFIC_BASE 1024
+#endif
+#define MFD_ALLOW_SEALING       0x0002U
+#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9)
+#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10)
+
+#define F_SEAL_SEAL     0x0001	/* prevent further seals from being set */
+#define F_SEAL_SHRINK   0x0002	/* prevent file from shrinking */
+#define F_SEAL_GROW     0x0004	/* prevent file from growing */
+#define F_SEAL_WRITE    0x0008	/* prevent writes */
 
 static void *
-unix_physmem_alloc_aligned (vlib_physmem_main_t * vpm, uword n_bytes,
-			    uword alignment)
+unix_physmem_alloc_aligned (vlib_main_t * vm, vlib_physmem_region_index_t idx,
+			    uword n_bytes, uword alignment)
 {
-  physmem_main_t *pm = &physmem_main;
+  vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx);
   uword lo_offset, hi_offset;
   uword *to_free = 0;
 
+  if (pr->heap == 0)
+    return 0;
+
   /* IO memory is always at least cache aligned. */
   alignment = clib_max (alignment, CLIB_CACHE_LINE_BYTES);
 
   while (1)
     {
-      mheap_get_aligned (pm->heap, n_bytes,
+      mheap_get_aligned (pr->heap, n_bytes,
 			 /* align */ alignment,
 			 /* align offset */ 0,
 			 &lo_offset);
@@ -63,11 +105,14 @@
       if (lo_offset == ~0)
 	break;
 
+      if (pr->flags & VLIB_PHYSMEM_F_FAKE)
+	break;
+
       /* Make sure allocation does not span DMA physical chunk boundary. */
       hi_offset = lo_offset + n_bytes - 1;
 
-      if ((lo_offset >> vpm->log2_n_bytes_per_page) ==
-	  (hi_offset >> vpm->log2_n_bytes_per_page))
+      if ((lo_offset >> pr->log2_page_size) ==
+	  (hi_offset >> pr->log2_page_size))
 	break;
 
       /* Allocation would span chunk boundary, queue it to be freed as soon as
@@ -79,134 +124,267 @@
     {
       uword i;
       for (i = 0; i < vec_len (to_free); i++)
-	mheap_put (pm->heap, to_free[i]);
+	mheap_put (pr->heap, to_free[i]);
       vec_free (to_free);
     }
 
-  return lo_offset != ~0 ? pm->heap + lo_offset : 0;
+  return lo_offset != ~0 ? pr->heap + lo_offset : 0;
 }
 
 static void
-unix_physmem_free (void *x)
+unix_physmem_free (vlib_main_t * vm, vlib_physmem_region_index_t idx, void *x)
 {
-  physmem_main_t *pm = &physmem_main;
-
+  vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx);
   /* Return object to region's heap. */
-  mheap_put (pm->heap, x - pm->heap);
+  mheap_put (pr->heap, x - pr->heap);
 }
 
-static void
-htlb_shutdown (void)
+static u64
+get_page_paddr (int fd, uword addr)
 {
-  physmem_main_t *pm = &physmem_main;
+  int pagesize = sysconf (_SC_PAGESIZE);
+  u64 seek, pagemap = 0;
 
-  if (!pm->shmid)
-    return;
-  shmctl (pm->shmid, IPC_RMID, 0);
-  pm->shmid = 0;
+  seek = ((u64) addr / pagesize) * sizeof (u64);
+  if (lseek (fd, seek, SEEK_SET) != seek)
+    {
+      clib_unix_warning ("lseek to 0x%llx", seek);
+      return 0;
+    }
+  if (read (fd, &pagemap, sizeof (pagemap)) != (sizeof (pagemap)))
+    {
+      clib_unix_warning ("read ptbits");
+      return 0;
+    }
+  if ((pagemap & (1ULL << 63)) == 0)
+    return 0;
+
+  pagemap &= pow2_mask (55);
+
+  return pagemap * pagesize;
 }
 
-/* try to use huge TLB pgs if possible */
-static int
-htlb_init (vlib_main_t * vm)
+static clib_error_t *
+unix_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size,
+			   u8 numa_node, u32 flags,
+			   vlib_physmem_region_index_t * idx)
 {
   vlib_physmem_main_t *vpm = &vm->physmem_main;
-  physmem_main_t *pm = &physmem_main;
-  u64 hugepagesize, pagesize;
-  u64 pfn, seek_loc;
-  u64 cur, physaddr, ptbits;
-  int fd, i;
+  vlib_physmem_region_t *pr;
+  clib_error_t *error = 0;
+  int pagemap_fd = -1;
+  u8 *mount_dir = 0;
+  u8 *filename = 0;
+  struct stat st;
+  int old_mpol;
+  int mmap_flags;
+  struct bitmask *old_mask = numa_allocate_nodemask ();
 
-  pm->shmid = shmget (11 /* key, my amp goes to 11 */ , pm->mem_size,
-		      IPC_CREAT | SHM_HUGETLB | SHM_R | SHM_W);
-  if (pm->shmid < 0)
+  if (geteuid () != 0 && (flags & VLIB_PHYSMEM_F_FAKE) == 0)
+    return clib_error_return (0, "not allowed");
+  pool_get (vpm->regions, pr);
+  memset (pr, 0, sizeof (*pr));	/* slot may be recycled: drop stale state */
+  pr->fd = -1;			/* error path closes any fd > -1 */
+  if ((pr - vpm->regions) >= 256)
     {
-      clib_unix_warning ("shmget");
-      return 0;
+      error = clib_error_return (0, "maximum number of regions reached");
+      goto error;
     }
 
-  pm->mem = shmat (pm->shmid, NULL, 0 /* flags */ );
-  if (pm->mem == 0)
+  pr->index = pr - vpm->regions;
+  pr->fd = -1;
+  pr->flags = flags;
+
+  if (get_mempolicy (&old_mpol, old_mask->maskp, old_mask->size + 1, NULL, 0)
+      == -1)
     {
-      shmctl (pm->shmid, IPC_RMID, 0);
-      return 0;
+      error = clib_error_return_unix (0, "get_mempolicy");
+      goto error;
     }
 
-  memset (pm->mem, 0, pm->mem_size);
-
-  /* $$$ get page size info from /proc/meminfo */
-  hugepagesize = 2 << 20;
-  pagesize = 4 << 10;
-  vpm->log2_n_bytes_per_page = min_log2 (hugepagesize);
-  vec_resize (vpm->page_table, pm->mem_size / hugepagesize);
-
-  vpm->page_mask = pow2_mask (vpm->log2_n_bytes_per_page);
-  vpm->virtual.start = pointer_to_uword (pm->mem);
-  vpm->virtual.size = pm->mem_size;
-  vpm->virtual.end = vpm->virtual.start + vpm->virtual.size;
-
-  fd = open ("/proc/self/pagemap", O_RDONLY);
-
-  if (fd < 0)
+  if ((flags & VLIB_PHYSMEM_F_FAKE) == 0)
     {
-      (void) shmdt (pm->mem);
-      return 0;
-    }
-
-  pm->heap = mheap_alloc_with_flags (pm->mem, pm->mem_size,
-				     /* Don't want mheap mmap/munmap with IO memory. */
-				     MHEAP_FLAG_DISABLE_VM |
-				     MHEAP_FLAG_THREAD_SAFE);
-
-  cur = pointer_to_uword (pm->mem);
-  i = 0;
-
-  while (cur < pointer_to_uword (pm->mem) + pm->mem_size)
-    {
-      pfn = (u64) cur / pagesize;
-      seek_loc = pfn * sizeof (u64);
-      if (lseek (fd, seek_loc, SEEK_SET) != seek_loc)
+      if ((pagemap_fd = open ((char *) "/proc/self/pagemap", O_RDONLY)) == -1)
 	{
-	  clib_unix_warning ("lseek to 0x%llx", seek_loc);
-	  shmctl (pm->shmid, IPC_RMID, 0);
-	  close (fd);
-	  return 0;
-	}
-      if (read (fd, &ptbits, sizeof (ptbits)) != (sizeof (ptbits)))
-	{
-	  clib_unix_warning ("read ptbits");
-	  shmctl (pm->shmid, IPC_RMID, 0);
-	  close (fd);
-	  return 0;
+	  error = clib_error_return_unix (0, "open '/proc/self/pagemap'");
+	  goto error;
 	}
 
-      /* bits 0-54 are the physical page number */
-      physaddr = (ptbits & 0x7fffffffffffffULL) * pagesize;
-      if (CLIB_DEBUG > 1)
-	fformat (stderr, "pm: virtual 0x%llx physical 0x%llx\n",
-		 cur, physaddr);
-      vpm->page_table[i++] = physaddr;
+      mount_dir = format (0, "%s/physmem_region%d%c",
+			  vlib_unix_get_runtime_dir (), pr->index, 0);
+      filename = format (0, "%s/mem%c", mount_dir, 0);
 
-      cur += hugepagesize;
+      unlink ((char *) mount_dir);
+
+      error = vlib_unix_recursive_mkdir ((char *) mount_dir);
+      if (error)
+	goto error;
+
+      if (mount ("none", (char *) mount_dir, "hugetlbfs", 0, NULL))
+	{
+	  error = clib_error_return_unix (0, "mount hugetlb directory '%s'",
+					  mount_dir);
+	  goto error;
+	}
+
+      if ((pr->fd = open ((char *) filename, O_CREAT | O_RDWR, 0755)) == -1)
+	{
+	  error = clib_error_return_unix (0, "open");
+	  goto error;
+	}
+
+      mmap_flags = MAP_SHARED | MAP_HUGETLB | MAP_LOCKED;
     }
-  close (fd);
-  atexit (htlb_shutdown);
-  return 1;
+  else
+    {
+      /* fake region: memfd-backed; on failure go through the common
+         cleanup so the pool slot and old_mask are released */
+      if ((pr->fd = memfd_create (name, MFD_ALLOW_SEALING)) == -1)
+	error = clib_error_return_unix (0, "memfd_create");
+      else if ((fcntl (pr->fd, F_ADD_SEALS, F_SEAL_SHRINK)) == -1)
+	error =
+	  clib_error_return_unix (0, "fcntl (F_ADD_SEALS, F_SEAL_SHRINK)");
+      if (error)
+	goto error;
+      mmap_flags = MAP_SHARED;
+    }
+
+  if (fstat (pr->fd, &st))
+    {
+      error = clib_error_return_unix (0, "fstat");
+      goto error;
+    }
+
+  pr->log2_page_size = min_log2 (st.st_blksize);
+  pr->n_pages = ((size - 1) >> pr->log2_page_size) + 1;
+  size = pr->n_pages * (1 << pr->log2_page_size);
+
+  if ((ftruncate (pr->fd, size)) == -1)
+    {
+      error = clib_error_return_unix (0, "ftruncate length: %d", size);
+      goto error;
+    }
+
+  if ((flags & VLIB_PHYSMEM_F_FAKE) == 0)
+    {
+      error = vlib_sysfs_prealloc_hugepages (numa_node,
+					     1 << (pr->log2_page_size - 10),
+					     pr->n_pages);
+      if (error)
+	goto error;
+    }
+
+  numa_set_preferred (numa_node);
+
+  pr->mem = mmap (0, size, (PROT_READ | PROT_WRITE), mmap_flags, pr->fd, 0);
+
+  if (pr->mem == MAP_FAILED)
+    {
+      pr->mem = 0;
+      error = clib_error_return_unix (0, "mmap");
+      goto error;
+    }
+
+  if (set_mempolicy (old_mpol, old_mask->maskp, old_mask->size + 1) == -1)
+    {
+      error = clib_error_return_unix (0, "set_mempolicy");
+      goto error;
+    }
+
+  pr->size = ((uword) pr->n_pages) << pr->log2_page_size;
+  pr->page_mask = (1 << pr->log2_page_size) - 1;
+  pr->numa_node = numa_node;
+  pr->name = format (0, "%s%c", name, 0);	/* NUL: printed with %s */
+
+  if ((flags & VLIB_PHYSMEM_F_FAKE) == 0)
+    {
+      int i;
+      for (i = 0; i < pr->n_pages; i++)
+	{
+	  void *ptr = pr->mem + (i << pr->log2_page_size);
+	  int node;
+	  move_pages (0, 1, &ptr, 0, &node, 0);
+	  if (numa_node != node)
+	    {
+	      clib_warning
+		("physmem page for region \'%s\' allocated on the wrong"
+		 " numa node (requested %u actual %u)", pr->name,
+		 pr->numa_node, node, i);
+	      break;
+	    }
+	}
+    }
+
+  if (flags & VLIB_PHYSMEM_F_INIT_MHEAP)
+    {
+      pr->heap = mheap_alloc_with_flags (pr->mem, pr->size,
+					 /* Don't want mheap mmap/munmap with IO memory. */
+					 MHEAP_FLAG_DISABLE_VM |
+					 MHEAP_FLAG_THREAD_SAFE);
+      fformat (stdout, "%U", format_mheap, pr->heap, /* verbose */ 1);
+    }
+
+  if (flags & VLIB_PHYSMEM_F_HAVE_BUFFERS)
+    {
+      vlib_buffer_add_mem_range (vm, pointer_to_uword (pr->mem), pr->size);
+    }
+
+  *idx = pr->index;
+
+  if ((flags & VLIB_PHYSMEM_F_FAKE) == 0)
+    {
+      int i;
+      for (i = 0; i < pr->n_pages; i++)
+	{
+	  uword vaddr =
+	    pointer_to_uword (pr->mem) + (((u64) i) << pr->log2_page_size);
+	  u64 page_paddr = get_page_paddr (pagemap_fd, vaddr);
+	  vec_add1 (pr->page_table, page_paddr);
+	}
+    }
+
+  goto done;
+
+error:
+  if (pr->fd > -1)
+    close (pr->fd);
+
+  if (pr->mem)
+    munmap (pr->mem, size);
+
+  memset (pr, 0, sizeof (*pr));
+  pool_put (vpm->regions, pr);
+
+done:
+  if (mount_dir)
+    {
+      umount2 ((char *) mount_dir, MNT_DETACH);
+      rmdir ((char *) mount_dir);
+      vec_free (mount_dir);
+    }
+  numa_free_cpumask (old_mask);
+  vec_free (filename);
+  if (pagemap_fd > -1)
+    close (pagemap_fd);
+  return error;
 }
 
-int vlib_app_physmem_init (vlib_main_t * vm,
-			   physmem_main_t * pm, int) __attribute__ ((weak));
-int
-vlib_app_physmem_init (vlib_main_t * vm, physmem_main_t * pm, int x)
+static void
+unix_physmem_region_free (vlib_main_t * vm, vlib_physmem_region_index_t idx)
 {
-  return 0;
+  vlib_physmem_main_t *vpm = &vm->physmem_main;
+  vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx);
+  if (pr->fd > -1)		/* 0 is a valid descriptor too */
+    close (pr->fd);
+  munmap (pr->mem, pr->size);
+  vec_free (pr->page_table);
+  vec_free (pr->name);
+  pool_put (vpm->regions, pr);
 }
 
 clib_error_t *
-unix_physmem_init (vlib_main_t * vm, int physical_memory_required)
+unix_physmem_init (vlib_main_t * vm)
 {
-  vlib_physmem_main_t *vpm = &vm->physmem_main;
-  physmem_main_t *pm = &physmem_main;
   clib_error_t *error = 0;
 
   /* Avoid multiple calls. */
@@ -215,50 +393,9 @@
 
   vm->os_physmem_alloc_aligned = unix_physmem_alloc_aligned;
   vm->os_physmem_free = unix_physmem_free;
-  pm->mem = MAP_FAILED;
+  vm->os_physmem_region_alloc = unix_physmem_region_alloc;
+  vm->os_physmem_region_free = unix_physmem_region_free;
 
-  if (pm->mem_size == 0)
-    pm->mem_size = 16 << 20;
-
-  /* OK, Mr. App, you tell us */
-  if (vlib_app_physmem_init (vm, pm, physical_memory_required))
-    return 0;
-
-  if (!pm->no_hugepages && htlb_init (vm))
-    {
-      fformat (stderr, "%s: use huge pages\n", __FUNCTION__);
-      return 0;
-    }
-
-  pm->mem =
-    mmap (0, pm->mem_size, PROT_READ | PROT_WRITE,
-	  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-  if (pm->mem == MAP_FAILED)
-    {
-      error = clib_error_return_unix (0, "mmap");
-      goto done;
-    }
-
-  pm->heap = mheap_alloc (pm->mem, pm->mem_size);
-
-  /* Identity map with a single page. */
-  vpm->log2_n_bytes_per_page = min_log2 (pm->mem_size);
-  vec_add1 (vpm->page_table, pointer_to_uword (pm->mem));
-
-  vpm->page_mask = pow2_mask (vpm->log2_n_bytes_per_page);
-  vpm->virtual.start = pointer_to_uword (pm->mem);
-  vpm->virtual.size = pm->mem_size;
-  vpm->virtual.end = vpm->virtual.start + vpm->virtual.size;
-  vpm->is_fake = 1;
-
-  fformat (stderr, "%s: use fake dma pages\n", __FUNCTION__);
-
-done:
-  if (error)
-    {
-      if (pm->mem != MAP_FAILED)
-	munmap (pm->mem, pm->mem_size);
-    }
   return error;
 }
 
@@ -266,12 +403,22 @@
 show_physmem (vlib_main_t * vm,
 	      unformat_input_t * input, vlib_cli_command_t * cmd)
 {
-  physmem_main_t *pm = &physmem_main;
+  vlib_physmem_main_t *vpm = &vm->physmem_main;
+  vlib_physmem_region_t *pr;
 
-  if (pm->heap)
-    vlib_cli_output (vm, "%U", format_mheap, pm->heap, /* verbose */ 1);
-  else
-    vlib_cli_output (vm, "No physmem allocated.");
+  /* *INDENT-OFF* */
+  pool_foreach (pr, vpm->regions, (
+    {
+      vlib_cli_output (vm, "index %u name '%s' page-size %uKB num-pages %d "
+		       "numa-node %u fd %d\n",
+		       pr->index, pr->name, (1 << (pr->log2_page_size -10)),
+		       pr->n_pages, pr->numa_node, pr->fd);
+      if (pr->heap)
+	vlib_cli_output (vm, "  %U", format_mheap, pr->heap, /* verbose */ 1);
+      else
+	vlib_cli_output (vm, "  no heap\n");
+    }));
+  /* *INDENT-ON* */
   return 0;
 }
 
@@ -283,177 +430,6 @@
 };
 /* *INDENT-ON* */
 
-static clib_error_t *
-show_affinity (vlib_main_t * vm,
-	       unformat_input_t * input, vlib_cli_command_t * cmd)
-{
-  cpu_set_t set;
-  cpu_set_t *setp = &set;
-  int i, rv;
-  u8 *s = 0;
-  int first_set_bit_in_run = -1;
-  int last_set_bit_in_run = -1;
-  int output_done = 0;
-
-  rv = sched_getaffinity (0 /* pid, 0 = this proc */ ,
-			  sizeof (*setp), setp);
-  if (rv < 0)
-    {
-      vlib_cli_output (vm, "Couldn't get affinity mask: %s\n",
-		       strerror (errno));
-      return 0;
-    }
-
-  for (i = 0; i < 64; i++)
-    {
-      if (CPU_ISSET (i, setp))
-	{
-	  if (first_set_bit_in_run == -1)
-	    {
-	      first_set_bit_in_run = i;
-	      last_set_bit_in_run = i;
-	      if (output_done)
-		s = format (s, ",");
-	      s = format (s, "%d-", i);
-	      output_done = 1;
-	    }
-	  else
-	    {
-	      if (i == (last_set_bit_in_run + 1))
-		last_set_bit_in_run = i;
-	    }
-	}
-      else
-	{
-	  if (first_set_bit_in_run != -1)
-	    {
-	      if (first_set_bit_in_run == (i - 1))
-		{
-		  _vec_len (s) -= 2 + ((first_set_bit_in_run / 10));
-		}
-	      s = format (s, "%d", last_set_bit_in_run);
-	      first_set_bit_in_run = -1;
-	      last_set_bit_in_run = -1;
-	    }
-	}
-    }
-
-  if (first_set_bit_in_run != -1)
-    s = format (s, "%d", first_set_bit_in_run);
-
-  vlib_cli_output (vm, "Process runs on: %v", s);
-  return 0;
-}
-
-/* *INDENT-OFF* */
-VLIB_CLI_COMMAND (show_affinity_command, static) = {
-  .path = "show affinity",
-  .short_help = "Show process cpu affinity",
-  .function = show_affinity,
-};
-/* *INDENT-ON* */
-
-static clib_error_t *
-set_affinity (vlib_main_t * vm,
-	      unformat_input_t * input, vlib_cli_command_t * cmd)
-{
-  cpu_set_t set;
-  cpu_set_t *setp = &set;
-  int i, rv;
-  int another_round;
-  u32 first, last;
-
-  memset (setp, 0, sizeof (*setp));
-
-  do
-    {
-      another_round = 0;
-      if (unformat (input, "%d-%d,", &first, &last))
-	{
-	  if (first > 64 || last > 64)
-	    {
-	    barf1:
-	      vlib_cli_output (vm, "range %d-%d invalid", first, last);
-	      return 0;
-	    }
-
-	  for (i = first; i <= last; i++)
-	    CPU_SET (i, setp);
-	  another_round = 1;
-	}
-      else if (unformat (input, "%d-%d", &first, &last))
-	{
-	  if (first > 64 || last > 64)
-	    goto barf1;
-
-	  for (i = first; i <= last; i++)
-	    CPU_SET (i, setp);
-	}
-      else if (unformat (input, "%d,", &first))
-	{
-	  if (first > 64)
-	    {
-	    barf2:
-	      vlib_cli_output (vm, "cpu %d invalid", first);
-	      return 0;
-	    }
-	  CPU_SET (first, setp);
-	  another_round = 1;
-	}
-      else if (unformat (input, "%d", &first))
-	{
-	  if (first > 64)
-	    goto barf2;
-
-	  CPU_SET (first, setp);
-	}
-    }
-  while (another_round);
-
-  rv = sched_setaffinity (0 /* pid, 0 = this proc */ ,
-			  sizeof (*setp), setp);
-
-  if (rv < 0)
-    {
-      vlib_cli_output (vm, "Couldn't get affinity mask: %s\n",
-		       strerror (errno));
-      return 0;
-    }
-  return show_affinity (vm, input, cmd);
-}
-
-/* *INDENT-OFF* */
-VLIB_CLI_COMMAND (set_affinity_command, static) = {
-  .path = "set affinity",
-  .short_help = "Set process cpu affinity",
-  .function = set_affinity,
-};
-/* *INDENT-ON* */
-
-static clib_error_t *
-vlib_physmem_configure (vlib_main_t * vm, unformat_input_t * input)
-{
-  physmem_main_t *pm = &physmem_main;
-  u32 size_in_mb;
-
-  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
-    {
-      if (unformat (input, "no-huge") || unformat (input, "no-huge-pages"))
-	pm->no_hugepages = 1;
-
-      else if (unformat (input, "size-in-mb %d", &size_in_mb) ||
-	       unformat (input, "size %d", &size_in_mb))
-	pm->mem_size = size_in_mb << 20;
-      else
-	return unformat_parse_error (input);
-    }
-
-  unformat_free (input);
-  return 0;
-}
-
-VLIB_EARLY_CONFIG_FUNCTION (vlib_physmem_configure, "physmem");
-
 /*
  * fd.io coding-style-patch-verification: ON
  *
diff --git a/src/vlib/unix/physmem.h b/src/vlib/unix/physmem.h
deleted file mode 100644
index 5519a7d..0000000
--- a/src/vlib/unix/physmem.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) 2015 Cisco and/or its affiliates.
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at:
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef __included_physmem_h__
-#define __included_physmem_h__
-
-/* Manage I/O physical memory. */
-#define _GNU_SOURCE
-#include <sched.h>
-#include <vppinfra/cache.h>
-#include <vppinfra/error.h>
-#include <vppinfra/mheap.h>
-#include <vppinfra/os.h>
-
-#include <vlib/vlib.h>
-#include <vlib/unix/unix.h>
-
-#include <sys/fcntl.h>		/* for open */
-#include <sys/file.h>		/* for flock */
-#include <sys/ioctl.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <sys/ipc.h>
-#include <sys/shm.h>
-
-typedef struct
-{
-  /* Virtual memory via mmaped. */
-  void *mem;
-
-  /* Size in bytes. */
-  uword mem_size;
-
-  /* Heap allocated out of virtual memory. */
-  void *heap;
-
-  /* huge TLB segment id */
-  int shmid;
-
-  /* should we try to use htlb ? */
-  int no_hugepages;
-
-} physmem_main_t;
-
-#endif /* __included_physmem_h__ */
-
-/*
- * fd.io coding-style-patch-verification: ON
- *
- * Local Variables:
- * eval: (c-set-style "gnu")
- * End:
- */
diff --git a/src/vlib/unix/unix.h b/src/vlib/unix/unix.h
index 97f5894..b5a3342 100644
--- a/src/vlib/unix/unix.h
+++ b/src/vlib/unix/unix.h
@@ -195,18 +195,7 @@
 /* Main function for Unix VLIB. */
 int vlib_unix_main (int argc, char *argv[]);
 
-/* Call to allocate/initialize physical DMA memory subsystem.
-   This is not an init function so that users can explicitly enable/disable
-   physmem when its not needed. */
-clib_error_t *unix_physmem_init (vlib_main_t * vm,
-				 int fail_if_physical_memory_not_present);
-
-static inline int
-unix_physmem_is_fake (vlib_main_t * vm)
-{
-  vlib_physmem_main_t *vpm = &vm->physmem_main;
-  return vpm->is_fake;
-}
+clib_error_t *unix_physmem_init (vlib_main_t * vm);
 
 /* Set prompt for CLI. */
 void vlib_unix_cli_set_prompt (char *prompt);
@@ -234,7 +223,16 @@
 
 u8 *vlib_sysfs_link_to_name (char *link);
 
-int vlib_sysfs_get_free_hugepages (unsigned int numa_node, int page_size);
+clib_error_t *vlib_sysfs_set_nr_hugepages (unsigned int numa_node,
+					   int page_size, int nr);
+clib_error_t *vlib_sysfs_get_nr_hugepages (unsigned int numa_node,
+					   int page_size, int *v);
+clib_error_t *vlib_sysfs_get_free_hugepages (unsigned int numa_node,
+					     int page_size, int *v);
+clib_error_t *vlib_sysfs_get_surplus_hugepages (unsigned int numa_node,
+						int page_size, int *v);
+clib_error_t *vlib_sysfs_prealloc_hugepages (unsigned int numa_node,
+					     int page_size, int nr);
 
 clib_error_t *foreach_directory_file (char *dir_name,
 				      clib_error_t * (*f) (void *arg,
diff --git a/src/vlib/unix/util.c b/src/vlib/unix/util.c
index 312cc9b..0e252ac 100644
--- a/src/vlib/unix/util.c
+++ b/src/vlib/unix/util.c
@@ -189,37 +189,132 @@
   return s;
 }
 
-int
-vlib_sysfs_get_free_hugepages (unsigned int numa_node, int page_size)
+clib_error_t *	/* Write nr into the nr_hugepages sysfs file for numa_node. */
+vlib_sysfs_set_nr_hugepages (unsigned int numa_node, int page_size, int nr)
 {
+  clib_error_t *error = 0;	/* returned to the caller; 0 means success */
   struct stat sb;
   u8 *p = 0;
-  int r = -1;
 
   p = format (p, "/sys/devices/system/node/node%u%c", numa_node, 0);
 
   if (stat ((char *) p, &sb) == 0)
     {
       if (S_ISDIR (sb.st_mode) == 0)
-	goto done;
+	{
+	  error = clib_error_return (0, "'%s' is not directory", p);
+	  goto done;
+	}
     }
   else if (numa_node == 0)
     {
       vec_reset_length (p);
       p = format (p, "/sys/kernel/mm%c", 0);
       if (stat ((char *) p, &sb) < 0 || S_ISDIR (sb.st_mode) == 0)
-	goto done;
+	{
+	  error = clib_error_return (0, "'%s' does not exist or it is not "
+				     "directory", p);
+	  goto done;
+	}
     }
   else
-    goto done;
+    {
+      error = clib_error_return (0, "'%s' does not exist", p);
+      goto done;
+    }
 
   _vec_len (p) -= 1;
-  p = format (p, "/hugepages/hugepages-%ukB/free_hugepages%c", page_size, 0);
-  vlib_sysfs_read ((char *) p, "%d", &r);
+  p = format (p, "/hugepages/hugepages-%ukB/nr_hugepages%c", page_size, 0);
+  error = vlib_sysfs_write ((char *) p, "%d", nr);	/* report failed write */
 
 done:
   vec_free (p);
-  return r;
+  return error;
+}
+
+
+static clib_error_t *	/* Read "<type>_hugepages" for numa_node into *val. */
+vlib_sysfs_get_xxx_hugepages (char *type, unsigned int numa_node,
+			      int page_size, int *val)
+{
+  clib_error_t *error = 0;
+  struct stat sb;
+  u8 *p = 0;		/* path vector, built up with format () */
+
+  p = format (p, "/sys/devices/system/node/node%u%c", numa_node, 0);
+
+  if (stat ((char *) p, &sb) == 0)
+    {
+      if (S_ISDIR (sb.st_mode) == 0)
+	{
+	  error = clib_error_return (0, "'%s' is not directory", p);
+	  goto done;
+	}
+    }
+  else if (numa_node == 0)	/* no per-node dir: try the global one */
+    {
+      vec_reset_length (p);
+      p = format (p, "/sys/kernel/mm%c", 0);
+      if (stat ((char *) p, &sb) < 0 || S_ISDIR (sb.st_mode) == 0)
+	{
+	  error = clib_error_return (0, "'%s' does not exist or it is not "
+				     "directory", p);
+	  goto done;
+	}
+    }
+  else
+    {
+      error = clib_error_return (0, "'%s' does not exist", p);
+      goto done;
+    }
+
+  _vec_len (p) -= 1;	/* drop the trailing 0 byte before appending */
+  p = format (p, "/hugepages/hugepages-%ukB/%s_hugepages%c", page_size,
+	      type, 0);
+  error = vlib_sysfs_read ((char *) p, "%d", val);
+
+done:
+  vec_free (p);
+  return error;
+}
+
+clib_error_t *	/* Read the node's "free_hugepages" value into *v. */
+vlib_sysfs_get_free_hugepages (unsigned int numa_node, int page_size, int *v)
+{
+  return vlib_sysfs_get_xxx_hugepages ("free", numa_node, page_size, v);
+}
+
+clib_error_t *	/* Read the node's "nr_hugepages" value into *v. */
+vlib_sysfs_get_nr_hugepages (unsigned int numa_node, int page_size, int *v)
+{
+  return vlib_sysfs_get_xxx_hugepages ("nr", numa_node, page_size, v);
+}
+
+clib_error_t *	/* Read the node's "surplus_hugepages" value into *v. */
+vlib_sysfs_get_surplus_hugepages (unsigned int numa_node, int page_size,
+				  int *v)
+{
+  return vlib_sysfs_get_xxx_hugepages ("surplus", numa_node, page_size, v);
+}
+
+clib_error_t *	/* Raise nr_hugepages by the shortfall of free pages vs nr. */
+vlib_sysfs_prealloc_hugepages (unsigned int numa_node, int page_size, int nr)
+{
+  clib_error_t *error = 0;
+  int n, needed;
+  error = vlib_sysfs_get_free_hugepages (numa_node, page_size, &n);
+  if (error)
+    return error;
+  needed = nr - n;		/* here n is the free page count */
+  if (needed <= 0)		/* enough pages already free */
+    return 0;
+
+  error = vlib_sysfs_get_nr_hugepages (numa_node, page_size, &n);
+  if (error)
+    return error;
+  clib_warning ("pre-allocating %u additional %uK hugepages on numa node %u",
+		needed, page_size, numa_node);
+  return vlib_sysfs_set_nr_hugepages (numa_node, page_size, n + needed);
+}
 
 clib_error_t *
diff --git a/src/vlib/vlib.h b/src/vlib/vlib.h
index b146a49..eed5c5b 100644
--- a/src/vlib/vlib.h
+++ b/src/vlib/vlib.h
@@ -50,6 +50,7 @@
 struct vlib_main_t;
 
 /* All includes in alphabetical order. */
+#include <vlib/physmem.h>
 #include <vlib/buffer.h>
 #include <vlib/cli.h>
 #include <vlib/counter.h>
@@ -57,7 +58,6 @@
 #include <vlib/init.h>
 #include <vlib/mc.h>
 #include <vlib/node.h>
-#include <vlib/physmem.h>
 #include <vlib/trace.h>
 
 /* Main include depends on other vlib/ includes so we put it last. */
@@ -65,6 +65,7 @@
 
 /* Inline/extern function declarations. */
 #include <vlib/threads.h>
+#include <vlib/physmem_funcs.h>
 #include <vlib/buffer_funcs.h>
 #include <vlib/cli_funcs.h>
 #include <vlib/error_funcs.h>