vlib physmem rework
This patch adds support for multiple NUMA-aware physmem regions.
Change-Id: I5c69a6f4da33c8ee21bdb8604d52fd2886f2327e
Signed-off-by: Damjan Marion <damarion@cisco.com>
diff --git a/src/examples/vlib/main_stub.c b/src/examples/vlib/main_stub.c
index 4d74bd7..3b19c53 100644
--- a/src/examples/vlib/main_stub.c
+++ b/src/examples/vlib/main_stub.c
@@ -27,8 +27,7 @@
{
clib_error_t *error;
- if ((error =
- unix_physmem_init (vm, /* fail_if_physical_memory_not_present */ 0)))
+ if ((error = unix_physmem_init (vm)))
return error;
if ((error = vlib_call_init_function (vm, unix_cli_init)))
diff --git a/src/plugins/dpdk/api/dpdk_api.c b/src/plugins/dpdk/api/dpdk_api.c
index 08afdd7..97c4bc7 100755
--- a/src/plugins/dpdk/api/dpdk_api.c
+++ b/src/plugins/dpdk/api/dpdk_api.c
@@ -20,7 +20,6 @@
#include <vnet/ethernet/ethernet.h>
#include <dpdk/device/dpdk.h>
-#include <vlib/unix/physmem.h>
#include <vlib/pci/pci.h>
#include <stdio.h>
diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c
index a795ba0..e23542f 100755
--- a/src/plugins/dpdk/device/init.c
+++ b/src/plugins/dpdk/device/init.c
@@ -17,10 +17,10 @@
#include <vppinfra/error.h>
#include <vppinfra/format.h>
#include <vppinfra/bitmap.h>
+#include <vlib/unix/unix.h>
#include <vnet/ethernet/ethernet.h>
#include <dpdk/device/dpdk.h>
-#include <vlib/unix/physmem.h>
#include <vlib/pci/pci.h>
#include <stdio.h>
@@ -1026,21 +1026,28 @@
clib_bitmap_foreach (c, tm->cpu_socket_bitmap, (
{
int pages_avail, page_size, mem;
+ clib_error_t *e = 0;
vec_validate(mem_by_socket, c);
mem = mem_by_socket[c];
page_size = 1024;
- pages_avail = vlib_sysfs_get_free_hugepages(c, page_size * 1024);
+ e = vlib_sysfs_get_free_hugepages(c, page_size * 1024, &pages_avail);
- if (pages_avail < 0 || page_size * pages_avail < mem)
+ if (e != 0 || pages_avail < 0 || page_size * pages_avail < mem)
use_1g = 0;
- page_size = 2;
- pages_avail = vlib_sysfs_get_free_hugepages(c, page_size * 1024);
+ if (e)
+ clib_error_free (e);
- if (pages_avail < 0 || page_size * pages_avail < mem)
+ page_size = 2;
+ e = vlib_sysfs_get_free_hugepages(c, page_size * 1024, &pages_avail);
+
+ if (e != 0 || pages_avail < 0 || page_size * pages_avail < mem)
use_2m = 0;
+
+ if (e)
+ clib_error_free (e);
}));
/* *INDENT-ON* */
diff --git a/src/plugins/dpdk/hqos/hqos.c b/src/plugins/dpdk/hqos/hqos.c
index 813eb91..c9b8565 100644
--- a/src/plugins/dpdk/hqos/hqos.c
+++ b/src/plugins/dpdk/hqos/hqos.c
@@ -29,7 +29,6 @@
#include <vnet/ethernet/ethernet.h>
#include <dpdk/device/dpdk.h>
-#include <vlib/unix/physmem.h>
#include <vlib/pci/pci.h>
#include <vlibmemory/api.h>
#include <vlibmemory/vl_memory_msg_enum.h> /* enumerate all vlib messages */
diff --git a/src/plugins/ixge/ixge.c b/src/plugins/ixge/ixge.c
index e0150f4..222c148 100644
--- a/src/plugins/ixge/ixge.c
+++ b/src/plugins/ixge/ixge.c
@@ -2493,10 +2493,11 @@
round_pow2 (xm->n_descriptors[rt], xm->n_descriptors_per_cache_line);
dq->head_index = dq->tail_index = 0;
- dq->descriptors = vlib_physmem_alloc_aligned (vm, &error,
- dq->n_descriptors *
- sizeof (dq->descriptors[0]),
- 128 /* per chip spec */ );
+ dq->descriptors =
+ vlib_physmem_alloc_aligned (vm, xm->physmem_region, &error,
+ dq->n_descriptors *
+ sizeof (dq->descriptors[0]),
+ 128 /* per chip spec */ );
if (error)
return error;
@@ -2518,7 +2519,8 @@
vlib_buffer_t *b =
vlib_get_buffer (vm, dq->descriptor_buffer_indices[i]);
dq->descriptors[i].rx_to_hw.tail_address =
- vlib_physmem_virtual_to_physical (vm, b->data);
+ vlib_physmem_virtual_to_physical (vm, xm->physmem_region,
+ b->data);
}
}
else
@@ -2526,7 +2528,8 @@
u32 i;
dq->tx.head_index_write_back =
- vlib_physmem_alloc (vm, &error, CLIB_CACHE_LINE_BYTES);
+ vlib_physmem_alloc (vm, vm->buffer_main->physmem_region, &error,
+ CLIB_CACHE_LINE_BYTES);
for (i = 0; i < dq->n_descriptors; i++)
dq->descriptors[i].tx = xm->tx_descriptor_template;
@@ -2538,7 +2541,9 @@
ixge_dma_regs_t *dr = get_dma_regs (xd, rt, queue_index);
u64 a;
- a = vlib_physmem_virtual_to_physical (vm, dq->descriptors);
+ a =
+ vlib_physmem_virtual_to_physical (vm, vm->buffer_main->physmem_region,
+ dq->descriptors);
dr->descriptor_address[0] = a & 0xFFFFFFFF;
dr->descriptor_address[1] = a >> (u64) 32;
dr->n_descriptor_bytes = dq->n_descriptors * sizeof (dq->descriptors[0]);
@@ -2564,7 +2569,9 @@
dq->tx.head_index_write_back[0] = dq->head_index;
a =
- vlib_physmem_virtual_to_physical (vm, dq->tx.head_index_write_back);
+ vlib_physmem_virtual_to_physical (vm,
+ vm->buffer_main->physmem_region,
+ dq->tx.head_index_write_back);
dr->tx.head_index_write_back_address[0] = /* enable bit */ 1 | a;
dr->tx.head_index_write_back_address[1] = (u64) a >> (u64) 32;
}
@@ -2850,9 +2857,12 @@
void *r;
ixge_device_t *xd;
- /* Device found: make sure we have dma memory. */
- if (unix_physmem_is_fake (vm))
- return clib_error_return (0, "no physical memory available");
+ /* Allocate physmem region for DMA buffers */
+ error = vlib_physmem_region_alloc (vm, "ixge decriptors", 2 << 20, 0,
+ VLIB_PHYSMEM_F_INIT_MHEAP,
+ &xm->physmem_region);
+ if (error)
+ return error;
error = vlib_pci_map_resource (dev, 0, &r);
if (error)
diff --git a/src/plugins/ixge/ixge.h b/src/plugins/ixge/ixge.h
index 779603b..42c1bfa 100644
--- a/src/plugins/ixge/ixge.h
+++ b/src/plugins/ixge/ixge.h
@@ -1266,6 +1266,8 @@
u32 *rx_buffers_to_add;
f64 time_last_stats_update;
+
+ vlib_physmem_region_index_t physmem_region;
} ixge_main_t;
ixge_main_t ixge_main;
diff --git a/src/vlib.am b/src/vlib.am
index 111dcfa..cab90e2 100644
--- a/src/vlib.am
+++ b/src/vlib.am
@@ -13,7 +13,7 @@
lib_LTLIBRARIES += libvlib.la
-libvlib_la_LIBADD = libvppinfra.la -ldl -lpthread
+libvlib_la_LIBADD = libvppinfra.la -ldl -lpthread -lnuma
libvlib_la_DEPENDENCIES = libvppinfra.la
BUILT_SOURCES += vlib/config.h
@@ -65,6 +65,7 @@
vlib/physmem.h \
vlib/pci/pci.h \
vlib/pci/pci_config.h \
+ vlib/physmem_funcs.h \
vlib/threads.h \
vlib/trace_funcs.h \
vlib/trace.h \
@@ -84,7 +85,6 @@
nobase_include_HEADERS += \
vlib/unix/cj.h \
vlib/unix/mc_socket.h \
- vlib/unix/physmem.h \
vlib/unix/plugin.h \
vlib/unix/unix.h
diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c
index 908368c..a5ec0e0 100644
--- a/src/vlib/buffer.c
+++ b/src/vlib/buffer.c
@@ -47,6 +47,7 @@
#include <vlib/unix/unix.h>
vlib_buffer_callbacks_t *vlib_buffer_callbacks = 0;
+static u32 vlib_buffer_physmem_sz = 32 << 20;
uword
vlib_buffer_length_in_chain_slow_path (vlib_main_t * vm,
@@ -461,7 +462,8 @@
u32 i;
for (i = 0; i < vec_len (f->buffer_memory_allocated); i++)
- vm->os_physmem_free (f->buffer_memory_allocated[i]);
+ vm->os_physmem_free (vm, vm->buffer_main->physmem_region,
+ f->buffer_memory_allocated[i]);
vec_free (f->name);
vec_free (f->buffer_memory_allocated);
vec_free (f->buffers);
@@ -552,9 +554,9 @@
n_bytes = n_this_chunk * (sizeof (b[0]) + fl->n_data_bytes);
/* drb: removed power-of-2 ASSERT */
- buffers = vm->os_physmem_alloc_aligned (&vm->physmem_main,
- n_bytes,
- sizeof (vlib_buffer_t));
+ buffers =
+ vm->os_physmem_alloc_aligned (vm, vm->buffer_main->physmem_region,
+ n_bytes, sizeof (vlib_buffer_t));
if (!buffers)
return n_alloc;
@@ -1051,10 +1053,25 @@
};
/* *INDENT-ON* */
-void
-vlib_buffer_cb_init (struct vlib_main_t *vm)
+clib_error_t *
+vlib_buffer_main_init (struct vlib_main_t * vm)
{
- vlib_buffer_main_t *bm = vm->buffer_main;
+ vlib_buffer_main_t *bm;
+ clib_error_t *error;
+
+ vec_validate (vm->buffer_main, 0);
+ bm = vm->buffer_main;
+
+ if (vlib_buffer_callbacks)
+ {
+ /* external plugin has registered own buffer callbacks
+ so we just copy them and quit */
+ vlib_buffer_main_t *bm = vm->buffer_main;
+ clib_memcpy (&bm->cb, vlib_buffer_callbacks,
+ sizeof (vlib_buffer_callbacks_t));
+ bm->callbacks_registered = 1;
+ return 0;
+ }
bm->cb.vlib_buffer_alloc_cb = &vlib_buffer_alloc_internal;
bm->cb.vlib_buffer_alloc_from_free_list_cb =
@@ -1064,8 +1081,49 @@
bm->cb.vlib_buffer_delete_free_list_cb =
&vlib_buffer_delete_free_list_internal;
clib_spinlock_init (&bm->buffer_known_hash_lockp);
+
+ /* allocate default region */
+ error = vlib_physmem_region_alloc (vm, "buffers",
+ vlib_buffer_physmem_sz, 0,
+ VLIB_PHYSMEM_F_INIT_MHEAP |
+ VLIB_PHYSMEM_F_HAVE_BUFFERS,
+ &bm->physmem_region);
+
+ if (error == 0)
+ return 0;
+
+ clib_error_free (error);
+
+ /* we may be running unprivileged, so try to allocate fake physmem */
+ error = vlib_physmem_region_alloc (vm, "buffers (fake)",
+ vlib_buffer_physmem_sz, 0,
+ VLIB_PHYSMEM_F_FAKE |
+ VLIB_PHYSMEM_F_INIT_MHEAP |
+ VLIB_PHYSMEM_F_HAVE_BUFFERS,
+ &bm->physmem_region);
+ return error;
}
+static clib_error_t *
+vlib_buffers_configure (vlib_main_t * vm, unformat_input_t * input)
+{
+  u32 size_in_mb;
+  /* Parse the "buffers" startup section; sets vlib_buffer_physmem_sz. */
+  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (!unformat (input, "memory-size-in-mb %d", &size_in_mb))
+	return unformat_parse_error (input);
+      if (size_in_mb < 1 || size_in_mb > 4095)	/* MB << 20 must fit in u32 */
+	return clib_error_return (0, "memory-size-in-mb out of range");
+      vlib_buffer_physmem_sz = size_in_mb << 20;
+    }
+  unformat_free (input);
+  return 0;
+}
+
+VLIB_EARLY_CONFIG_FUNCTION (vlib_buffers_configure, "buffers");
+
+
/** @endcond */
/*
* fd.io coding-style-patch-verification: ON
diff --git a/src/vlib/buffer.h b/src/vlib/buffer.h
index 5504bf7..e47dbc6 100644
--- a/src/vlib/buffer.h
+++ b/src/vlib/buffer.h
@@ -408,6 +408,7 @@
buffer index */
uword buffer_mem_start;
uword buffer_mem_size;
+ vlib_physmem_region_index_t physmem_region;
/* Buffer free callback, for subversive activities */
u32 (*buffer_free_callback) (struct vlib_main_t * vm,
@@ -442,7 +443,7 @@
void vlib_buffer_add_mem_range (struct vlib_main_t *vm, uword start,
uword size);
-void vlib_buffer_cb_init (struct vlib_main_t *vm);
+clib_error_t *vlib_buffer_main_init (struct vlib_main_t *vm);
typedef struct
{
diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h
index 78bf931..d51de6b 100644
--- a/src/vlib/buffer_funcs.h
+++ b/src/vlib/buffer_funcs.h
@@ -162,7 +162,7 @@
always_inline u64
vlib_get_buffer_data_physical_address (vlib_main_t * vm, u32 buffer_index)
{
- return vlib_physmem_offset_to_physical (&vm->physmem_main,
+ return vlib_physmem_offset_to_physical (vm, vm->buffer_main->physmem_region,
(((uword) buffer_index) <<
CLIB_LOG2_CACHE_LINE_BYTES) +
STRUCT_OFFSET_OF (vlib_buffer_t,
@@ -455,43 +455,6 @@
}
}
-always_inline void *
-vlib_physmem_alloc_aligned (vlib_main_t * vm, clib_error_t ** error,
- uword n_bytes, uword alignment)
-{
- void *r =
- vm->os_physmem_alloc_aligned (&vm->physmem_main, n_bytes, alignment);
- if (!r)
- *error =
- clib_error_return (0, "failed to allocate %wd bytes of I/O memory",
- n_bytes);
- else
- *error = 0;
- return r;
-}
-
-/* By default allocate I/O memory with cache line alignment. */
-always_inline void *
-vlib_physmem_alloc (vlib_main_t * vm, clib_error_t ** error, uword n_bytes)
-{
- return vlib_physmem_alloc_aligned (vm, error, n_bytes,
- CLIB_CACHE_LINE_BYTES);
-}
-
-always_inline void
-vlib_physmem_free (vlib_main_t * vm, void *mem)
-{
- return vm->os_physmem_free (mem);
-}
-
-always_inline u64
-vlib_physmem_virtual_to_physical (vlib_main_t * vm, void *mem)
-{
- vlib_physmem_main_t *pm = &vm->physmem_main;
- uword o = pointer_to_uword (mem) - pm->virtual.start;
- return vlib_physmem_offset_to_physical (pm, o);
-}
-
/* Append given data to end of buffer, possibly allocating new buffers. */
u32 vlib_buffer_add_data (vlib_main_t * vm,
u32 free_list_index,
diff --git a/src/vlib/main.c b/src/vlib/main.c
index 5d99e89..7875f62 100644
--- a/src/vlib/main.c
+++ b/src/vlib/main.c
@@ -1705,22 +1705,16 @@
if (!vm->name)
vm->name = "VLIB";
- vec_validate (vm->buffer_main, 0);
- if (vlib_buffer_callbacks)
+ if ((error = unix_physmem_init (vm)))
{
- /* external plugin has registered own buffer callbacks
- so we just copy them */
- vlib_buffer_main_t *bm = vm->buffer_main;
- clib_memcpy (&bm->cb, vlib_buffer_callbacks,
- sizeof (vlib_buffer_callbacks_t));
- bm->callbacks_registered = 1;
+ clib_error_report (error);
+ goto done;
}
- else
+
+ if ((error = vlib_buffer_main_init (vm)))
{
- vlib_physmem_main_t *vpm = &vm->physmem_main;
- vlib_buffer_cb_init (vm);
- unix_physmem_init (vm, 0 /* fail_if_physical_memory_not_present */ );
- vlib_buffer_add_mem_range (vm, vpm->virtual.start, vpm->virtual.size);
+ clib_error_report (error);
+ goto done;
}
if ((error = vlib_thread_init (vm)))
diff --git a/src/vlib/main.h b/src/vlib/main.h
index b63c63f..4c0cde3 100644
--- a/src/vlib/main.h
+++ b/src/vlib/main.h
@@ -107,9 +107,21 @@
/* Allocate/free buffer memory for DMA transfers, descriptor rings, etc.
buffer memory is guaranteed to be cache-aligned. */
- void *(*os_physmem_alloc_aligned) (vlib_physmem_main_t * pm,
+
+ clib_error_t *(*os_physmem_region_alloc) (struct vlib_main_t * vm,
+ char *name, u32 size,
+ u8 numa_node, u32 flags,
+ vlib_physmem_region_index_t *
+ idx);
+
+ void (*os_physmem_region_free) (struct vlib_main_t * vm,
+ vlib_physmem_region_index_t idx);
+
+ void *(*os_physmem_alloc_aligned) (struct vlib_main_t * vm,
+ vlib_physmem_region_index_t idx,
uword n_bytes, uword alignment);
- void (*os_physmem_free) (void *x);
+ void (*os_physmem_free) (struct vlib_main_t * vm,
+ vlib_physmem_region_index_t idx, void *x);
/* Node graph main structure. */
vlib_node_main_t node_main;
diff --git a/src/vlib/physmem.h b/src/vlib/physmem.h
index 9e7d52a..a7fed12 100644
--- a/src/vlib/physmem.h
+++ b/src/vlib/physmem.h
@@ -40,63 +40,36 @@
#ifndef included_vlib_physmem_h
#define included_vlib_physmem_h
+typedef u8 vlib_physmem_region_index_t;
+
typedef struct
{
- uword start, end, size;
+ vlib_physmem_region_index_t index;
+ void *mem;
+ uword size;
+ int fd;
+ u8 log2_page_size;
+ u16 n_pages;
+ u32 page_mask;
+
+ void *heap;
+ u32 flags;
+#define VLIB_PHYSMEM_F_INIT_MHEAP (1<<0)
+#define VLIB_PHYSMEM_F_HAVE_BUFFERS (1<<1)
+#define VLIB_PHYSMEM_F_FAKE (1<<2)
+
+ u8 numa_node;
+ u64 *page_table;
+ u8 *name;
} vlib_physmem_region_t;
+
+
typedef struct
{
- vlib_physmem_region_t virtual;
-
- uword log2_n_bytes_per_page;
-
- /* 1 << log2_n_bytes_per_page - 1. */
- uword page_mask;
-
- u64 *page_table;
-
- /* is fake physmem */
- u8 is_fake;
+ vlib_physmem_region_t *regions;
} vlib_physmem_main_t;
-always_inline u64
-vlib_physmem_offset_to_physical (vlib_physmem_main_t * pm, uword o)
-{
- uword page_index = o >> pm->log2_n_bytes_per_page;
- ASSERT (o < pm->virtual.size);
- ASSERT (pm->page_table[page_index] != 0);
- return (vec_elt (pm->page_table, page_index) + (o & pm->page_mask));
-}
-
-always_inline int
-vlib_physmem_is_virtual (vlib_physmem_main_t * pm, uword p)
-{
- return p >= pm->virtual.start && p < pm->virtual.end;
-}
-
-always_inline uword
-vlib_physmem_offset_of (vlib_physmem_main_t * pm, void *p)
-{
- uword a = pointer_to_uword (p);
- uword o;
-
- ASSERT (vlib_physmem_is_virtual (pm, a));
- o = a - pm->virtual.start;
-
- /* Offset must fit in 32 bits. */
- ASSERT ((uword) o == a - pm->virtual.start);
-
- return o;
-}
-
-always_inline void *
-vlib_physmem_at_offset (vlib_physmem_main_t * pm, uword offset)
-{
- ASSERT (offset < pm->virtual.size);
- return uword_to_pointer (pm->virtual.start + offset, void *);
-}
-
#endif /* included_vlib_physmem_h */
/*
diff --git a/src/vlib/physmem_funcs.h b/src/vlib/physmem_funcs.h
new file mode 100644
index 0000000..dbb8d9d
--- /dev/null
+++ b/src/vlib/physmem_funcs.h
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * physmem_funcs.h: virtual <-> physical memory mapping for VLIB buffers
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_vlib_physmem_funcs_h
+#define included_vlib_physmem_funcs_h
+
+always_inline vlib_physmem_region_t *
+vlib_physmem_get_region (vlib_main_t * vm, u8 index)
+{
+ vlib_physmem_main_t *vpm = &vm->physmem_main;
+ return pool_elt_at_index (vpm->regions, index);
+}
+
+always_inline u64
+vlib_physmem_offset_to_physical (vlib_main_t * vm,
+ vlib_physmem_region_index_t idx, uword o)
+{
+ vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx);
+ uword page_index = o >> pr->log2_page_size;
+ ASSERT (o < pr->size);
+ ASSERT (pr->page_table[page_index] != 0);
+ return (vec_elt (pr->page_table, page_index) + (o & pr->page_mask));
+}
+
+always_inline int
+vlib_physmem_is_virtual (vlib_main_t * vm, vlib_physmem_region_index_t idx,
+ uword p)
+{
+ vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx);
+ return p >= pointer_to_uword (pr->mem)
+ && p < (pointer_to_uword (pr->mem) + pr->size);
+}
+
+always_inline uword
+vlib_physmem_offset_of (vlib_main_t * vm, vlib_physmem_region_index_t idx,
+ void *p)
+{
+ vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx);
+ uword a = pointer_to_uword (p);
+ uword o;
+
+ ASSERT (vlib_physmem_is_virtual (vm, idx, a));
+ o = a - pointer_to_uword (pr->mem);
+
+ /* Offset must fit in 32 bits. */
+ ASSERT ((uword) o == a - pointer_to_uword (pr->mem));
+
+ return o;
+}
+
+always_inline void *
+vlib_physmem_at_offset (vlib_main_t * vm, vlib_physmem_region_index_t idx,
+ uword offset)
+{
+ vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx);
+ ASSERT (offset < pr->size);
+ return uword_to_pointer (pointer_to_uword (pr->mem) + offset, void *);
+}
+
+always_inline void *
+vlib_physmem_alloc_aligned (vlib_main_t * vm, vlib_physmem_region_index_t idx,
+ clib_error_t ** error,
+ uword n_bytes, uword alignment)
+{
+ void *r = vm->os_physmem_alloc_aligned (vm, idx, n_bytes, alignment);
+ if (!r)
+ *error =
+ clib_error_return (0, "failed to allocate %wd bytes of I/O memory",
+ n_bytes);
+ else
+ *error = 0;
+ return r;
+}
+
+/* By default allocate I/O memory with cache line alignment. */
+always_inline void *
+vlib_physmem_alloc (vlib_main_t * vm, vlib_physmem_region_index_t idx,
+ clib_error_t ** error, uword n_bytes)
+{
+ return vlib_physmem_alloc_aligned (vm, idx, error, n_bytes,
+ CLIB_CACHE_LINE_BYTES);
+}
+
+always_inline void
+vlib_physmem_free (vlib_main_t * vm, vlib_physmem_region_index_t idx,
+		   void *mem)
+{
+  vm->os_physmem_free (vm, idx, mem);	/* void hook: no value to return */
+}
+
+always_inline u64
+vlib_physmem_virtual_to_physical (vlib_main_t * vm,
+				  vlib_physmem_region_index_t idx, void *mem)
+{
+  /* Virtual -> physical for mem in region idx: take its in-region offset
+     (vlib_physmem_offset_of ASSERTs the range), then map that offset. */
+  uword o = vlib_physmem_offset_of (vm, idx, mem);
+  return vlib_physmem_offset_to_physical (vm, idx, o);
+}
+
+
+always_inline clib_error_t *
+vlib_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size,
+ u8 numa_node, u32 flags,
+ vlib_physmem_region_index_t * idx)
+{
+ return vm->os_physmem_region_alloc (vm, name, size, numa_node, flags, idx);
+}
+
+always_inline void
+vlib_physmem_region_free (struct vlib_main_t *vm,
+ vlib_physmem_region_index_t idx)
+{
+ vm->os_physmem_region_free (vm, idx);
+}
+
+#endif /* included_vlib_physmem_funcs_h */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/unix/physmem.c b/src/vlib/unix/physmem.c
index 27a5bac..d5d5d6c 100644
--- a/src/vlib/unix/physmem.c
+++ b/src/vlib/unix/physmem.c
@@ -37,24 +37,66 @@
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
-#include <vlib/unix/physmem.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/mount.h>
+#include <sys/mman.h>
+#include <sys/fcntl.h>
+#include <sys/stat.h>
+#include <numa.h>
+#include <numaif.h>
-static physmem_main_t physmem_main;
+#include <vlib/vlib.h>
+#include <vlib/physmem.h>
+#include <vlib/unix/unix.h>
+
+#ifndef __NR_memfd_create
+#if defined __x86_64__
+#define __NR_memfd_create 319
+#elif defined __arm__
+#define __NR_memfd_create 385
+#elif defined __aarch64__
+#define __NR_memfd_create 279
+#else
+#error "__NR_memfd_create unknown for this architecture"
+#endif
+#endif
+
+static inline int
+memfd_create (const char *name, unsigned int flags)
+{
+ return syscall (__NR_memfd_create, name, flags);
+}
+
+#ifndef F_LINUX_SPECIFIC_BASE
+#define F_LINUX_SPECIFIC_BASE 1024
+#endif
+#define MFD_ALLOW_SEALING 0x0002U
+#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9)
+#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10)
+
+#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */
+#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */
+#define F_SEAL_GROW 0x0004 /* prevent file from growing */
+#define F_SEAL_WRITE 0x0008 /* prevent writes */
static void *
-unix_physmem_alloc_aligned (vlib_physmem_main_t * vpm, uword n_bytes,
- uword alignment)
+unix_physmem_alloc_aligned (vlib_main_t * vm, vlib_physmem_region_index_t idx,
+ uword n_bytes, uword alignment)
{
- physmem_main_t *pm = &physmem_main;
+ vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx);
uword lo_offset, hi_offset;
uword *to_free = 0;
+ if (pr->heap == 0)
+ return 0;
+
/* IO memory is always at least cache aligned. */
alignment = clib_max (alignment, CLIB_CACHE_LINE_BYTES);
while (1)
{
- mheap_get_aligned (pm->heap, n_bytes,
+ mheap_get_aligned (pr->heap, n_bytes,
/* align */ alignment,
/* align offset */ 0,
&lo_offset);
@@ -63,11 +105,14 @@
if (lo_offset == ~0)
break;
+ if (pr->flags & VLIB_PHYSMEM_F_FAKE)
+ break;
+
/* Make sure allocation does not span DMA physical chunk boundary. */
hi_offset = lo_offset + n_bytes - 1;
- if ((lo_offset >> vpm->log2_n_bytes_per_page) ==
- (hi_offset >> vpm->log2_n_bytes_per_page))
+ if ((lo_offset >> pr->log2_page_size) ==
+ (hi_offset >> pr->log2_page_size))
break;
/* Allocation would span chunk boundary, queue it to be freed as soon as
@@ -79,134 +124,267 @@
{
uword i;
for (i = 0; i < vec_len (to_free); i++)
- mheap_put (pm->heap, to_free[i]);
+ mheap_put (pr->heap, to_free[i]);
vec_free (to_free);
}
- return lo_offset != ~0 ? pm->heap + lo_offset : 0;
+ return lo_offset != ~0 ? pr->heap + lo_offset : 0;
}
static void
-unix_physmem_free (void *x)
+unix_physmem_free (vlib_main_t * vm, vlib_physmem_region_index_t idx, void *x)
{
- physmem_main_t *pm = &physmem_main;
-
+ vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx);
/* Return object to region's heap. */
- mheap_put (pm->heap, x - pm->heap);
+ mheap_put (pr->heap, x - pr->heap);
}
-static void
-htlb_shutdown (void)
+static u64
+get_page_paddr (int fd, uword addr)
{
- physmem_main_t *pm = &physmem_main;
+ int pagesize = sysconf (_SC_PAGESIZE);
+ u64 seek, pagemap = 0;
- if (!pm->shmid)
- return;
- shmctl (pm->shmid, IPC_RMID, 0);
- pm->shmid = 0;
+ seek = ((u64) addr / pagesize) * sizeof (u64);
+ if (lseek (fd, seek, SEEK_SET) != seek)
+ {
+ clib_unix_warning ("lseek to 0x%llx", seek);
+ return 0;
+ }
+ if (read (fd, &pagemap, sizeof (pagemap)) != (sizeof (pagemap)))
+ {
+ clib_unix_warning ("read ptbits");
+ return 0;
+ }
+ if ((pagemap & (1ULL << 63)) == 0)
+ return 0;
+
+ pagemap &= pow2_mask (55);
+
+ return pagemap * pagesize;
}
-/* try to use huge TLB pgs if possible */
-static int
-htlb_init (vlib_main_t * vm)
+static clib_error_t *
+unix_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size,
+ u8 numa_node, u32 flags,
+ vlib_physmem_region_index_t * idx)
{
vlib_physmem_main_t *vpm = &vm->physmem_main;
- physmem_main_t *pm = &physmem_main;
- u64 hugepagesize, pagesize;
- u64 pfn, seek_loc;
- u64 cur, physaddr, ptbits;
- int fd, i;
+ vlib_physmem_region_t *pr;
+ clib_error_t *error = 0;
+ int pagemap_fd = -1;
+ u8 *mount_dir = 0;
+ u8 *filename = 0;
+ struct stat st;
+ int old_mpol;
+ int mmap_flags;
+ struct bitmask *old_mask = numa_allocate_nodemask ();
- pm->shmid = shmget (11 /* key, my amp goes to 11 */ , pm->mem_size,
- IPC_CREAT | SHM_HUGETLB | SHM_R | SHM_W);
- if (pm->shmid < 0)
+ if (geteuid () != 0 && (flags & VLIB_PHYSMEM_F_FAKE) == 0)
+ return clib_error_return (0, "not allowed");
+
+ pool_get (vpm->regions, pr);
+
+ if ((pr - vpm->regions) >= 256)
{
- clib_unix_warning ("shmget");
- return 0;
+ error = clib_error_return (0, "maximum number of regions reached");
+ goto error;
}
- pm->mem = shmat (pm->shmid, NULL, 0 /* flags */ );
- if (pm->mem == 0)
+ pr->index = pr - vpm->regions;
+ pr->fd = -1;
+ pr->flags = flags;
+
+ if (get_mempolicy (&old_mpol, old_mask->maskp, old_mask->size + 1, NULL, 0)
+ == -1)
{
- shmctl (pm->shmid, IPC_RMID, 0);
- return 0;
+ error = clib_error_return_unix (0, "get_mempolicy");
+ goto error;
}
- memset (pm->mem, 0, pm->mem_size);
-
- /* $$$ get page size info from /proc/meminfo */
- hugepagesize = 2 << 20;
- pagesize = 4 << 10;
- vpm->log2_n_bytes_per_page = min_log2 (hugepagesize);
- vec_resize (vpm->page_table, pm->mem_size / hugepagesize);
-
- vpm->page_mask = pow2_mask (vpm->log2_n_bytes_per_page);
- vpm->virtual.start = pointer_to_uword (pm->mem);
- vpm->virtual.size = pm->mem_size;
- vpm->virtual.end = vpm->virtual.start + vpm->virtual.size;
-
- fd = open ("/proc/self/pagemap", O_RDONLY);
-
- if (fd < 0)
+ if ((flags & VLIB_PHYSMEM_F_FAKE) == 0)
{
- (void) shmdt (pm->mem);
- return 0;
- }
-
- pm->heap = mheap_alloc_with_flags (pm->mem, pm->mem_size,
- /* Don't want mheap mmap/munmap with IO memory. */
- MHEAP_FLAG_DISABLE_VM |
- MHEAP_FLAG_THREAD_SAFE);
-
- cur = pointer_to_uword (pm->mem);
- i = 0;
-
- while (cur < pointer_to_uword (pm->mem) + pm->mem_size)
- {
- pfn = (u64) cur / pagesize;
- seek_loc = pfn * sizeof (u64);
- if (lseek (fd, seek_loc, SEEK_SET) != seek_loc)
+ if ((pagemap_fd = open ((char *) "/proc/self/pagemap", O_RDONLY)) == -1)
{
- clib_unix_warning ("lseek to 0x%llx", seek_loc);
- shmctl (pm->shmid, IPC_RMID, 0);
- close (fd);
- return 0;
- }
- if (read (fd, &ptbits, sizeof (ptbits)) != (sizeof (ptbits)))
- {
- clib_unix_warning ("read ptbits");
- shmctl (pm->shmid, IPC_RMID, 0);
- close (fd);
- return 0;
+ error = clib_error_return_unix (0, "open '/proc/self/pagemap'");
+ goto error;
}
- /* bits 0-54 are the physical page number */
- physaddr = (ptbits & 0x7fffffffffffffULL) * pagesize;
- if (CLIB_DEBUG > 1)
- fformat (stderr, "pm: virtual 0x%llx physical 0x%llx\n",
- cur, physaddr);
- vpm->page_table[i++] = physaddr;
+ mount_dir = format (0, "%s/physmem_region%d%c",
+ vlib_unix_get_runtime_dir (), pr->index, 0);
+ filename = format (0, "%s/mem%c", mount_dir, 0);
- cur += hugepagesize;
+ unlink ((char *) mount_dir);
+
+ error = vlib_unix_recursive_mkdir ((char *) mount_dir);
+ if (error)
+ goto error;
+
+ if (mount ("none", (char *) mount_dir, "hugetlbfs", 0, NULL))
+ {
+ error = clib_error_return_unix (0, "mount hugetlb directory '%s'",
+ mount_dir);
+ goto error;
+ }
+
+ if ((pr->fd = open ((char *) filename, O_CREAT | O_RDWR, 0755)) == -1)
+ {
+ error = clib_error_return_unix (0, "open");
+ goto error;
+ }
+
+ mmap_flags = MAP_SHARED | MAP_HUGETLB | MAP_LOCKED;
}
- close (fd);
- atexit (htlb_shutdown);
- return 1;
+ else
+ {
+ if ((pr->fd = memfd_create (name, MFD_ALLOW_SEALING)) == -1)
+ return clib_error_return_unix (0, "memfd_create");
+
+ if ((fcntl (pr->fd, F_ADD_SEALS, F_SEAL_SHRINK)) == -1)
+ {
+ error =
+ clib_error_return_unix (0, "fcntl (F_ADD_SEALS, F_SEAL_SHRINK)");
+ goto error;
+ }
+ mmap_flags = MAP_SHARED;
+ }
+
+ if (fstat (pr->fd, &st))
+ {
+ error = clib_error_return_unix (0, "fstat");
+ goto error;
+ }
+
+ pr->log2_page_size = min_log2 (st.st_blksize);
+ pr->n_pages = ((size - 1) >> pr->log2_page_size) + 1;
+ size = pr->n_pages * (1 << pr->log2_page_size);
+
+ if ((ftruncate (pr->fd, size)) == -1)
+ {
+ error = clib_error_return_unix (0, "ftruncate length: %d", size);
+ goto error;
+ }
+
+ if ((flags & VLIB_PHYSMEM_F_FAKE) == 0)
+ {
+ error = vlib_sysfs_prealloc_hugepages (numa_node,
+ 1 << (pr->log2_page_size - 10),
+ pr->n_pages);
+ if (error)
+ goto error;
+ }
+
+ numa_set_preferred (numa_node);
+
+ pr->mem = mmap (0, size, (PROT_READ | PROT_WRITE), mmap_flags, pr->fd, 0);
+
+ if (pr->mem == MAP_FAILED)
+ {
+ pr->mem = 0;
+ error = clib_error_return_unix (0, "mmap");
+ goto error;
+ }
+
+ if (set_mempolicy (old_mpol, old_mask->maskp, old_mask->size + 1) == -1)
+ {
+ error = clib_error_return_unix (0, "set_mempolicy");
+ goto error;
+ }
+
+ pr->size = pr->n_pages << pr->log2_page_size;
+ pr->page_mask = (1 << pr->log2_page_size) - 1;
+ pr->numa_node = numa_node;
+ pr->name = format (0, "%s", name);
+
+ if ((flags & VLIB_PHYSMEM_F_FAKE) == 0)
+ {
+ int i;
+ for (i = 0; i < pr->n_pages; i++)
+ {
+ void *ptr = pr->mem + (i << pr->log2_page_size);
+ int node;
+ move_pages (0, 1, &ptr, 0, &node, 0);
+ if (numa_node != node)
+ {
+ clib_warning
+ ("physmem page for region \'%s\' allocated on the wrong"
+ " numa node (requested %u actual %u)", pr->name,
+ pr->numa_node, node, i);
+ break;
+ }
+ }
+ }
+
+ if (flags & VLIB_PHYSMEM_F_INIT_MHEAP)
+ {
+ pr->heap = mheap_alloc_with_flags (pr->mem, pr->size,
+ /* Don't want mheap mmap/munmap with IO memory. */
+ MHEAP_FLAG_DISABLE_VM |
+ MHEAP_FLAG_THREAD_SAFE);
+ fformat (stdout, "%U", format_mheap, pr->heap, /* verbose */ 1);
+ }
+
+ if (flags & VLIB_PHYSMEM_F_HAVE_BUFFERS)
+ {
+ vlib_buffer_add_mem_range (vm, pointer_to_uword (pr->mem), pr->size);
+ }
+
+ *idx = pr->index;
+
+ if ((flags & VLIB_PHYSMEM_F_FAKE) == 0)
+ {
+ int i;
+ for (i = 0; i < pr->n_pages; i++)
+ {
+ uword vaddr =
+ pointer_to_uword (pr->mem) + (((u64) i) << pr->log2_page_size);
+ u64 page_paddr = get_page_paddr (pagemap_fd, vaddr);
+ vec_add1 (pr->page_table, page_paddr);
+ }
+ }
+
+ goto done;
+
+error:
+ if (pr->fd > -1)
+ close (pr->fd);
+
+ if (pr->mem)
+ munmap (pr->mem, size);
+
+ memset (pr, 0, sizeof (*pr));
+ pool_put (vpm->regions, pr);
+
+done:
+ if (mount_dir)
+ {
+ umount2 ((char *) mount_dir, MNT_DETACH);
+ rmdir ((char *) mount_dir);
+ vec_free (mount_dir);
+ }
+ numa_free_cpumask (old_mask);
+ vec_free (filename);
+ if (pagemap_fd > -1)
+ close (pagemap_fd);
+ return error;
}
-int vlib_app_physmem_init (vlib_main_t * vm,
- physmem_main_t * pm, int) __attribute__ ((weak));
-int
-vlib_app_physmem_init (vlib_main_t * vm, physmem_main_t * pm, int x)
+static void
+unix_physmem_region_free (vlib_main_t * vm, vlib_physmem_region_index_t idx)
{
- return 0;
+  vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx);
+  if (pr->fd > -1)		/* -1 marks "no fd"; 0 is a valid descriptor */
+    close (pr->fd);
+  munmap (pr->mem, pr->size);
+  vec_free (pr->name);
+  vec_free (pr->page_table);	/* filled by vec_add1 in region_alloc */
+  memset (pr, 0, sizeof (*pr));	/* zero the slot, as alloc's error path does */
+  pool_put (vm->physmem_main.regions, pr);
}
clib_error_t *
-unix_physmem_init (vlib_main_t * vm, int physical_memory_required)
+unix_physmem_init (vlib_main_t * vm)
{
- vlib_physmem_main_t *vpm = &vm->physmem_main;
- physmem_main_t *pm = &physmem_main;
clib_error_t *error = 0;
/* Avoid multiple calls. */
@@ -215,50 +393,9 @@
vm->os_physmem_alloc_aligned = unix_physmem_alloc_aligned;
vm->os_physmem_free = unix_physmem_free;
- pm->mem = MAP_FAILED;
+ vm->os_physmem_region_alloc = unix_physmem_region_alloc;
+ vm->os_physmem_region_free = unix_physmem_region_free;
- if (pm->mem_size == 0)
- pm->mem_size = 16 << 20;
-
- /* OK, Mr. App, you tell us */
- if (vlib_app_physmem_init (vm, pm, physical_memory_required))
- return 0;
-
- if (!pm->no_hugepages && htlb_init (vm))
- {
- fformat (stderr, "%s: use huge pages\n", __FUNCTION__);
- return 0;
- }
-
- pm->mem =
- mmap (0, pm->mem_size, PROT_READ | PROT_WRITE,
- MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
- if (pm->mem == MAP_FAILED)
- {
- error = clib_error_return_unix (0, "mmap");
- goto done;
- }
-
- pm->heap = mheap_alloc (pm->mem, pm->mem_size);
-
- /* Identity map with a single page. */
- vpm->log2_n_bytes_per_page = min_log2 (pm->mem_size);
- vec_add1 (vpm->page_table, pointer_to_uword (pm->mem));
-
- vpm->page_mask = pow2_mask (vpm->log2_n_bytes_per_page);
- vpm->virtual.start = pointer_to_uword (pm->mem);
- vpm->virtual.size = pm->mem_size;
- vpm->virtual.end = vpm->virtual.start + vpm->virtual.size;
- vpm->is_fake = 1;
-
- fformat (stderr, "%s: use fake dma pages\n", __FUNCTION__);
-
-done:
- if (error)
- {
- if (pm->mem != MAP_FAILED)
- munmap (pm->mem, pm->mem_size);
- }
return error;
}
@@ -266,12 +403,22 @@
show_physmem (vlib_main_t * vm,
unformat_input_t * input, vlib_cli_command_t * cmd)
{
- physmem_main_t *pm = &physmem_main;
+ vlib_physmem_main_t *vpm = &vm->physmem_main;
+ vlib_physmem_region_t *pr;
- if (pm->heap)
- vlib_cli_output (vm, "%U", format_mheap, pm->heap, /* verbose */ 1);
- else
- vlib_cli_output (vm, "No physmem allocated.");
+ /* *INDENT-OFF* */
+ pool_foreach (pr, vpm->regions, (
+ {
+ vlib_cli_output (vm, "index %u name '%s' page-size %uKB num-pages %d "
+ "numa-node %u fd %d\n",
+ pr->index, pr->name, (1 << (pr->log2_page_size -10)),
+ pr->n_pages, pr->numa_node, pr->fd);
+ if (pr->heap)
+ vlib_cli_output (vm, " %U", format_mheap, pr->heap, /* verbose */ 1);
+ else
+ vlib_cli_output (vm, " no heap\n");
+ }));
+ /* *INDENT-ON* */
return 0;
}
@@ -283,177 +430,6 @@
};
/* *INDENT-ON* */
-static clib_error_t *
-show_affinity (vlib_main_t * vm,
- unformat_input_t * input, vlib_cli_command_t * cmd)
-{
- cpu_set_t set;
- cpu_set_t *setp = &set;
- int i, rv;
- u8 *s = 0;
- int first_set_bit_in_run = -1;
- int last_set_bit_in_run = -1;
- int output_done = 0;
-
- rv = sched_getaffinity (0 /* pid, 0 = this proc */ ,
- sizeof (*setp), setp);
- if (rv < 0)
- {
- vlib_cli_output (vm, "Couldn't get affinity mask: %s\n",
- strerror (errno));
- return 0;
- }
-
- for (i = 0; i < 64; i++)
- {
- if (CPU_ISSET (i, setp))
- {
- if (first_set_bit_in_run == -1)
- {
- first_set_bit_in_run = i;
- last_set_bit_in_run = i;
- if (output_done)
- s = format (s, ",");
- s = format (s, "%d-", i);
- output_done = 1;
- }
- else
- {
- if (i == (last_set_bit_in_run + 1))
- last_set_bit_in_run = i;
- }
- }
- else
- {
- if (first_set_bit_in_run != -1)
- {
- if (first_set_bit_in_run == (i - 1))
- {
- _vec_len (s) -= 2 + ((first_set_bit_in_run / 10));
- }
- s = format (s, "%d", last_set_bit_in_run);
- first_set_bit_in_run = -1;
- last_set_bit_in_run = -1;
- }
- }
- }
-
- if (first_set_bit_in_run != -1)
- s = format (s, "%d", first_set_bit_in_run);
-
- vlib_cli_output (vm, "Process runs on: %v", s);
- return 0;
-}
-
-/* *INDENT-OFF* */
-VLIB_CLI_COMMAND (show_affinity_command, static) = {
- .path = "show affinity",
- .short_help = "Show process cpu affinity",
- .function = show_affinity,
-};
-/* *INDENT-ON* */
-
-static clib_error_t *
-set_affinity (vlib_main_t * vm,
- unformat_input_t * input, vlib_cli_command_t * cmd)
-{
- cpu_set_t set;
- cpu_set_t *setp = &set;
- int i, rv;
- int another_round;
- u32 first, last;
-
- memset (setp, 0, sizeof (*setp));
-
- do
- {
- another_round = 0;
- if (unformat (input, "%d-%d,", &first, &last))
- {
- if (first > 64 || last > 64)
- {
- barf1:
- vlib_cli_output (vm, "range %d-%d invalid", first, last);
- return 0;
- }
-
- for (i = first; i <= last; i++)
- CPU_SET (i, setp);
- another_round = 1;
- }
- else if (unformat (input, "%d-%d", &first, &last))
- {
- if (first > 64 || last > 64)
- goto barf1;
-
- for (i = first; i <= last; i++)
- CPU_SET (i, setp);
- }
- else if (unformat (input, "%d,", &first))
- {
- if (first > 64)
- {
- barf2:
- vlib_cli_output (vm, "cpu %d invalid", first);
- return 0;
- }
- CPU_SET (first, setp);
- another_round = 1;
- }
- else if (unformat (input, "%d", &first))
- {
- if (first > 64)
- goto barf2;
-
- CPU_SET (first, setp);
- }
- }
- while (another_round);
-
- rv = sched_setaffinity (0 /* pid, 0 = this proc */ ,
- sizeof (*setp), setp);
-
- if (rv < 0)
- {
- vlib_cli_output (vm, "Couldn't get affinity mask: %s\n",
- strerror (errno));
- return 0;
- }
- return show_affinity (vm, input, cmd);
-}
-
-/* *INDENT-OFF* */
-VLIB_CLI_COMMAND (set_affinity_command, static) = {
- .path = "set affinity",
- .short_help = "Set process cpu affinity",
- .function = set_affinity,
-};
-/* *INDENT-ON* */
-
-static clib_error_t *
-vlib_physmem_configure (vlib_main_t * vm, unformat_input_t * input)
-{
- physmem_main_t *pm = &physmem_main;
- u32 size_in_mb;
-
- while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
- {
- if (unformat (input, "no-huge") || unformat (input, "no-huge-pages"))
- pm->no_hugepages = 1;
-
- else if (unformat (input, "size-in-mb %d", &size_in_mb) ||
- unformat (input, "size %d", &size_in_mb))
- pm->mem_size = size_in_mb << 20;
- else
- return unformat_parse_error (input);
- }
-
- unformat_free (input);
- return 0;
-}
-
-VLIB_EARLY_CONFIG_FUNCTION (vlib_physmem_configure, "physmem");
-
/*
* fd.io coding-style-patch-verification: ON
*
diff --git a/src/vlib/unix/physmem.h b/src/vlib/unix/physmem.h
deleted file mode 100644
index 5519a7d..0000000
--- a/src/vlib/unix/physmem.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) 2015 Cisco and/or its affiliates.
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at:
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef __included_physmem_h__
-#define __included_physmem_h__
-
-/* Manage I/O physical memory. */
-#define _GNU_SOURCE
-#include <sched.h>
-#include <vppinfra/cache.h>
-#include <vppinfra/error.h>
-#include <vppinfra/mheap.h>
-#include <vppinfra/os.h>
-
-#include <vlib/vlib.h>
-#include <vlib/unix/unix.h>
-
-#include <sys/fcntl.h> /* for open */
-#include <sys/file.h> /* for flock */
-#include <sys/ioctl.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <sys/ipc.h>
-#include <sys/shm.h>
-
-typedef struct
-{
- /* Virtual memory via mmaped. */
- void *mem;
-
- /* Size in bytes. */
- uword mem_size;
-
- /* Heap allocated out of virtual memory. */
- void *heap;
-
- /* huge TLB segment id */
- int shmid;
-
- /* should we try to use htlb ? */
- int no_hugepages;
-
-} physmem_main_t;
-
-#endif /* __included_physmem_h__ */
-
-/*
- * fd.io coding-style-patch-verification: ON
- *
- * Local Variables:
- * eval: (c-set-style "gnu")
- * End:
- */
diff --git a/src/vlib/unix/unix.h b/src/vlib/unix/unix.h
index 97f5894..b5a3342 100644
--- a/src/vlib/unix/unix.h
+++ b/src/vlib/unix/unix.h
@@ -195,18 +195,7 @@
/* Main function for Unix VLIB. */
int vlib_unix_main (int argc, char *argv[]);
-/* Call to allocate/initialize physical DMA memory subsystem.
- This is not an init function so that users can explicitly enable/disable
- physmem when its not needed. */
-clib_error_t *unix_physmem_init (vlib_main_t * vm,
- int fail_if_physical_memory_not_present);
-
-static inline int
-unix_physmem_is_fake (vlib_main_t * vm)
-{
- vlib_physmem_main_t *vpm = &vm->physmem_main;
- return vpm->is_fake;
-}
+clib_error_t *unix_physmem_init (vlib_main_t * vm);
/* Set prompt for CLI. */
void vlib_unix_cli_set_prompt (char *prompt);
@@ -234,7 +223,16 @@
u8 *vlib_sysfs_link_to_name (char *link);
-int vlib_sysfs_get_free_hugepages (unsigned int numa_node, int page_size);
+clib_error_t *vlib_sysfs_set_nr_hugepages (unsigned int numa_node,
+ int page_size, int nr);
+clib_error_t *vlib_sysfs_get_nr_hugepages (unsigned int numa_node,
+ int page_size, int *v);
+clib_error_t *vlib_sysfs_get_free_hugepages (unsigned int numa_node,
+ int page_size, int *v);
+clib_error_t *vlib_sysfs_get_surplus_hugepages (unsigned int numa_node,
+ int page_size, int *v);
+clib_error_t *vlib_sysfs_prealloc_hugepages (unsigned int numa_node,
+ int page_size, int nr);
clib_error_t *foreach_directory_file (char *dir_name,
clib_error_t * (*f) (void *arg,
diff --git a/src/vlib/unix/util.c b/src/vlib/unix/util.c
index 312cc9b..0e252ac 100644
--- a/src/vlib/unix/util.c
+++ b/src/vlib/unix/util.c
@@ -189,37 +189,132 @@
return s;
}
-int
-vlib_sysfs_get_free_hugepages (unsigned int numa_node, int page_size)
+/*
+ * Write nr to the nr_hugepages sysfs entry for the given page size (in kB)
+ * on a NUMA node. The per-node directory
+ * /sys/devices/system/node/node<N> is used when it exists; for node 0 only,
+ * /sys/kernel/mm is used as a fallback when it does not.
+ *
+ * Returns 0 on success, or a clib error when the sysfs directory cannot be
+ * located or the write itself fails.
+ */
+clib_error_t *
+vlib_sysfs_set_nr_hugepages (unsigned int numa_node, int page_size, int nr)
{
+ clib_error_t *error = 0;
struct stat sb;
u8 *p = 0;
- int r = -1;
p = format (p, "/sys/devices/system/node/node%u%c", numa_node, 0);
if (stat ((char *) p, &sb) == 0)
{
if (S_ISDIR (sb.st_mode) == 0)
- goto done;
+ {
+ error = clib_error_return (0, "'%s' is not directory", p);
+ goto done;
+ }
}
else if (numa_node == 0)
{
vec_reset_length (p);
p = format (p, "/sys/kernel/mm%c", 0);
if (stat ((char *) p, &sb) < 0 || S_ISDIR (sb.st_mode) == 0)
- goto done;
+ {
+ error = clib_error_return (0, "'%s' does not exist or it is not "
+ "directory", p);
+ goto done;
+ }
}
else
- goto done;
+ {
+ error = clib_error_return (0, "'%s' does not exist", p);
+ goto done;
+ }
_vec_len (p) -= 1;
- p = format (p, "/hugepages/hugepages-%ukB/free_hugepages%c", page_size, 0);
- vlib_sysfs_read ((char *) p, "%d", &r);
+ p = format (p, "/hugepages/hugepages-%ukB/nr_hugepages%c", page_size, 0);
+ /* Propagate a failed sysfs write instead of silently dropping (and
+ leaking) the returned error. */
+ error = vlib_sysfs_write ((char *) p, "%d", nr);
done:
vec_free (p);
- return r;
+ return error;
+}
+
+
+/*
+ * Read the "<type>_hugepages" sysfs counter for the given page size (in kB)
+ * on a NUMA node into *val. The per-node sysfs directory is preferred; for
+ * node 0 only, /sys/kernel/mm is tried when the per-node directory is
+ * missing. Returns 0 on success or a clib error.
+ */
+static clib_error_t *
+vlib_sysfs_get_xxx_hugepages (char *type, unsigned int numa_node,
+                              int page_size, int *val)
+{
+  struct stat st;
+  clib_error_t *err = 0;
+  u8 *path = format (0, "/sys/devices/system/node/node%u%c", numa_node, 0);
+
+  if (stat ((char *) path, &st) != 0)
+    {
+      /* No per-node directory: only node 0 may fall back to the global one. */
+      if (numa_node != 0)
+        {
+          err = clib_error_return (0, "'%s' does not exist", path);
+        }
+      else
+        {
+          vec_reset_length (path);
+          path = format (path, "/sys/kernel/mm%c", 0);
+          if (stat ((char *) path, &st) < 0 || S_ISDIR (st.st_mode) == 0)
+            err = clib_error_return (0, "'%s' does not exist or it is not "
+                                     "directory", path);
+        }
+    }
+  else if (S_ISDIR (st.st_mode) == 0)
+    {
+      err = clib_error_return (0, "'%s' is not directory", path);
+    }
+
+  if (err == 0)
+    {
+      /* Drop the trailing NUL so the counter path can be appended. */
+      _vec_len (path) -= 1;
+      path = format (path, "/hugepages/hugepages-%ukB/%s_hugepages%c",
+                     page_size, type, 0);
+      err = vlib_sysfs_read ((char *) path, "%d", val);
+    }
+
+  vec_free (path);
+  return err;
+}
+
+/* Read the "free_hugepages" sysfs counter for the given NUMA node and page
+ size (in kB) into *v; returns 0 or a clib error. */
+clib_error_t *
+vlib_sysfs_get_free_hugepages (unsigned int numa_node, int page_size, int *v)
+{
+ return vlib_sysfs_get_xxx_hugepages ("free", numa_node, page_size, v);
+}
+
+/* Read the "nr_hugepages" sysfs counter for the given NUMA node and page
+ size (in kB) into *v; returns 0 or a clib error. */
+clib_error_t *
+vlib_sysfs_get_nr_hugepages (unsigned int numa_node, int page_size, int *v)
+{
+ return vlib_sysfs_get_xxx_hugepages ("nr", numa_node, page_size, v);
+}
+
+/* Read the "surplus_hugepages" sysfs counter for the given NUMA node and
+ page size (in kB) into *v; returns 0 or a clib error. */
+clib_error_t *
+vlib_sysfs_get_surplus_hugepages (unsigned int numa_node, int page_size,
+ int *v)
+{
+ return vlib_sysfs_get_xxx_hugepages ("surplus", numa_node, page_size, v);
+}
+
+/*
+ * Make sure at least nr hugepages of the given size (in kB) are free on the
+ * NUMA node. When fewer are free, nr_hugepages is raised by the shortfall.
+ * Returns 0 when nothing needs to be done, otherwise the result of the
+ * sysfs read/write that was attempted.
+ */
+clib_error_t *
+vlib_sysfs_prealloc_hugepages (unsigned int numa_node, int page_size, int nr)
+{
+  clib_error_t *err;
+  int count, shortfall;
+
+  /* How many pages are free right now? */
+  if ((err = vlib_sysfs_get_free_hugepages (numa_node, page_size, &count)))
+    return err;
+
+  shortfall = nr - count;
+  if (shortfall <= 0)
+    return 0;
+
+  /* Grow the pool from its current total by the missing amount. */
+  if ((err = vlib_sysfs_get_nr_hugepages (numa_node, page_size, &count)))
+    return err;
+
+  clib_warning ("pre-allocating %u additional %uK hugepages on numa node %u",
+                shortfall, page_size, numa_node);
+  return vlib_sysfs_set_nr_hugepages (numa_node, page_size,
+                                      count + shortfall);
}
clib_error_t *
diff --git a/src/vlib/vlib.h b/src/vlib/vlib.h
index b146a49..eed5c5b 100644
--- a/src/vlib/vlib.h
+++ b/src/vlib/vlib.h
@@ -50,6 +50,7 @@
struct vlib_main_t;
/* All includes in alphabetical order. */
+#include <vlib/physmem.h>
#include <vlib/buffer.h>
#include <vlib/cli.h>
#include <vlib/counter.h>
@@ -57,7 +58,6 @@
#include <vlib/init.h>
#include <vlib/mc.h>
#include <vlib/node.h>
-#include <vlib/physmem.h>
#include <vlib/trace.h>
/* Main include depends on other vlib/ includes so we put it last. */
@@ -65,6 +65,7 @@
/* Inline/extern function declarations. */
#include <vlib/threads.h>
+#include <vlib/physmem_funcs.h>
#include <vlib/buffer_funcs.h>
#include <vlib/cli_funcs.h>
#include <vlib/error_funcs.h>