Repair vlib API socket server
- Teach vpp_api_test to send/receive API messages over sockets
- Add memfd-based shared memory
- Add api messages to create memfd-based shared memory segments
- vpp_api_test supports both socket and shared memory segment connections
- vpp_api_test pivot from socket to shared memory API messaging
- add socket client support to libvlibclient.so
- dead client reaper sends ping messages, container-friendly
- dead client reaper falls back to kill (<pid>, 0) live checking
if e.g. a python app goes silent for tens of seconds
- handle ping messages in python client support code
- teach show api ring about pairwise shared-memory segments
- fix ip probing of already resolved destinations (VPP-998)
We'll need this work to implement proper host-stack client isolation
Change-Id: Ic23b65f75c854d0393d9a2e9d6b122a9551be769
Signed-off-by: Dave Barach <dave@barachs.net>
Signed-off-by: Dave Wallace <dwallacelf@gmail.com>
Signed-off-by: Florin Coras <fcoras@cisco.com>
diff --git a/src/svm/memfd.c b/src/svm/memfd.c
new file mode 100644
index 0000000..9fe487d
--- /dev/null
+++ b/src/svm/memfd.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2017 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "memfd.h"
+
+/* Create, size and map a memfd segment as master, then build an mheap one
+ * page in. Returns 0, or a negative MEMFD_API_ERROR_* code on failure. */
+int
+memfd_master_init (memfd_private_t * memfd, u32 master_index)
+{
+  int flags = MAP_SHARED;
+  memfd_shared_header_t *sh;
+  u64 ticks = clib_cpu_time_now ();
+  u64 randomize_baseva;
+  void *oldheap;
+
+  if (memfd->memfd_size == 0)
+    return MEMFD_API_ERROR_NO_SIZE;
+
+  /* NUL-terminate: the name is used as a C string by memfd_create and %s */
+  memfd->name = format (0, "memfd svm region %d%c", master_index, 0);
+  ASSERT (vec_c_string_is_terminated (memfd->name));
+
+  memfd->fd = memfd_create ((char *) memfd->name, MFD_ALLOW_SEALING);
+  if (memfd->fd < 0)
+    {
+      clib_unix_warning ("create segment '%s'", memfd->name);
+      return MEMFD_API_ERROR_CREATE_FAILURE;
+    }
+
+  if ((ftruncate (memfd->fd, memfd->memfd_size)) == -1)
+    {
+      clib_unix_warning ("set memfd size");
+      close (memfd->fd);        /* do not leak the descriptor on failure */
+      return MEMFD_API_ERROR_SET_SIZE;
+    }
+
+  if ((fcntl (memfd->fd, F_ADD_SEALS, F_SEAL_SHRINK)) == -1)
+    clib_unix_warning ("fcntl (F_ADD_SEALS, F_SEAL_SHRINK)");
+
+  /* A caller-requested base becomes mandatory and is offset by 0..15 pages */
+  randomize_baseva = (ticks & 15) * MMAP_PAGESIZE;
+  if (memfd->requested_va)
+    {
+      flags |= MAP_FIXED;
+      memfd->requested_va += randomize_baseva;
+    }
+
+  sh = memfd->sh =
+    (memfd_shared_header_t *) mmap ((void *) memfd->requested_va,
+                                    memfd->memfd_size, PROT_READ | PROT_WRITE,
+                                    flags, memfd->fd, 0);
+  if (memfd->sh == MAP_FAILED)
+    {
+      clib_unix_warning ("mmap");
+      close (memfd->fd);
+      return MEMFD_API_ERROR_MMAP;
+    }
+
+  memfd->my_pid = getpid ();
+  sh->master_pid = memfd->my_pid;
+  sh->memfd_size = memfd->memfd_size;
+  /* The first page holds the shared header; the heap gets the remainder */
+  sh->heap = mheap_alloc_with_flags
+    (((u8 *) sh) + MMAP_PAGESIZE, memfd->memfd_size - MMAP_PAGESIZE,
+     MHEAP_FLAG_DISABLE_VM | MHEAP_FLAG_THREAD_SAFE);
+  sh->memfd_va = pointer_to_uword (sh);
+  sh->master_index = master_index;
+  /* Keep a NUL-terminated copy of the name inside the segment's own heap */
+  oldheap = memfd_push_heap (sh);
+  sh->name = format (0, "%s%c", memfd->name, 0);
+  memfd_pop_heap (oldheap);
+  memfd->i_am_master = 1;
+  /* The application has to set sh->ready... */
+  return 0;
+}
+
+/*
+ * Subtly different than svm_slave_init. The caller
+ * needs to acquire a usable file descriptor for the memfd segment
+ * e.g. via vppinfra/socket.c:default_socket_recvmsg
+ */
+
+int
+memfd_slave_init (memfd_private_t * memfd)
+{
+  memfd_shared_header_t *hdr;
+  void *probe;
+
+  memfd->i_am_master = 0;
+  /* Peek at the first page to learn where, and how large, to map */
+  probe = mmap (0, MMAP_PAGESIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
+                memfd->fd, 0);
+  if (probe == MAP_FAILED)
+    {
+      clib_unix_warning ("slave research mmap");
+      close (memfd->fd);
+      return MEMFD_API_ERROR_MMAP;
+    }
+  hdr = (memfd_shared_header_t *) probe;
+  memfd->requested_va = (u64) hdr->memfd_va;
+  memfd->memfd_size = hdr->memfd_size;
+  munmap (probe, MMAP_PAGESIZE);
+
+  /* Map the whole segment at exactly the address recorded in the header */
+  memfd->sh = (void *) mmap ((void *) memfd->requested_va, memfd->memfd_size,
+                             PROT_READ | PROT_WRITE,
+                             MAP_SHARED | MAP_FIXED, memfd->fd, 0);
+  hdr = memfd->sh;
+  if (hdr == MAP_FAILED)
+    {
+      clib_unix_warning ("slave final mmap");
+      close (memfd->fd);
+      return MEMFD_API_ERROR_MMAP;
+    }
+  hdr->slave_pid = getpid ();
+  return 0;
+}
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/svm/memfd.h b/src/svm/memfd.h
new file mode 100644
index 0000000..3ed4a9a
--- /dev/null
+++ b/src/svm/memfd.h
@@ -0,0 +1,194 @@
+/*
+ * Copyright (c) 2017 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __included_memfd_h__
+#define __included_memfd_h__
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <netinet/in.h>
+#include <signal.h>
+#include <pthread.h>
+#include <unistd.h>
+#include <time.h>
+#include <fcntl.h>
+#include <string.h>
+#include <vppinfra/clib.h>
+#include <vppinfra/vec.h>
+#include <vppinfra/hash.h>
+#include <vppinfra/bitmap.h>
+#include <vppinfra/fifo.h>
+#include <vppinfra/time.h>
+#include <vppinfra/mheap.h>
+#include <vppinfra/heap.h>
+#include <vppinfra/pool.h>
+#include <vppinfra/format.h>
+/* DGMS, memfd syscall not in glibc... */
+#include <vppinfra/linux/syscall.h>
+
+#ifndef MMAP_PAGESIZE
+#define MMAP_PAGESIZE (clib_mem_get_page_size()) /* only if not already defined */
+#endif
+
+#define MEMFD_N_OPAQUE 7 /* number of slots in memfd_shared_header_t.opaque[] */
+
+typedef struct /* header at the start of a mapped memfd segment */
+{
+ /* Spin-lock */
+ volatile u32 lock; /* 0 = free, 1 = held; taken with test-and-set */
+ volatile u32 owner_pid; /* pid passed to memfd_lock, 0 once fully unlocked */
+ int recursion_count; /* nesting depth of memfd_lock by the owner */
+ u32 tag; /* for debugging */
+
+ /* The allocation arena */
+ void *heap;
+
+ /* Segment must be mapped at this address, or no supper */
+ u64 memfd_va;
+ /* The actual mmap size */
+ u64 memfd_size;
+ u32 master_pid; /* getpid () of the creator, set by memfd_master_init */
+ u32 slave_pid; /* getpid () of the attacher, set by memfd_slave_init */
+ u8 *name; /* NUL-terminated name copy allocated on the segment heap */
+ void *opaque[MEMFD_N_OPAQUE]; /* NOTE(review): presumably application-defined; unused in memfd.c */
+
+ /* Set when the master application thinks it's time to make the donuts */
+ volatile u32 ready;
+
+ /* Needed to make unique MAC addresses, etc. */
+ u32 master_index;
+} memfd_shared_header_t;
+
+typedef struct /* per-process handle on a memfd segment */
+{
+ memfd_shared_header_t *sh; /* mapped segment base; holds MAP_FAILED if the mmap failed */
+ int fd; /* memfd descriptor: created by the master, supplied by the caller for a slave */
+ u64 memfd_size; /* segment size in bytes; the master must set it before init */
+ u32 my_pid; /* getpid () recorded by memfd_master_init */
+ u32 vlib_hw_if_index; /* NOTE(review): not referenced in memfd.c - confirm use */
+ uword requested_va; /* address handed to mmap; 0 lets the master map anywhere */
+ int i_am_master; /* 1 after memfd_master_init, 0 after memfd_slave_init */
+ u32 per_interface_next_index; /* NOTE(review): not referenced in memfd.c - confirm use */
+ u32 *rx_queue; /* NOTE(review): not referenced in memfd.c - confirm use */
+ u8 *name; /* segment name vector built by memfd_master_init */
+} memfd_private_t;
+
+always_inline void
+memfd_lock (memfd_shared_header_t * h, u32 my_pid, u32 tag)
+{
+  /* First acquisition by this pid: spin for the lock, then record ownership */
+  if (h->owner_pid != my_pid)
+    {
+      while (__sync_lock_test_and_set (&h->lock, 1) != 0)
+        continue;
+      h->owner_pid = my_pid;
+      h->recursion_count = 1;
+      h->tag = tag;
+      return;
+    }
+  /* Re-entry by the current owner only deepens the nesting count */
+  h->recursion_count++;
+}
+
+always_inline void
+memfd_lock_non_recursive (memfd_shared_header_t * h, u32 tag)
+{
+  /* Busy-wait until the test-and-set finds the lock word clear */
+  while (__sync_lock_test_and_set (&h->lock, 1) != 0)
+    continue;
+  h->tag = tag;
+}
+
+always_inline void
+memfd_unlock (memfd_shared_header_t * h)
+{
+  /* Still nested: the lock stays held until the count reaches zero */
+  if (--h->recursion_count != 0)
+    return;
+  h->owner_pid = 0;
+  h->tag = 0;
+  CLIB_MEMORY_BARRIER ();
+  h->lock = 0;
+}
+
+always_inline void
+memfd_unlock_non_recursive (memfd_shared_header_t * h)
+{
+ h->tag = 0; /* clear the debug tag before the lock word is released */
+ CLIB_MEMORY_BARRIER (); /* barrier between the store above and the release below */
+ h->lock = 0; /* release: a spinning test-and-set can now succeed */
+}
+
+static inline void *
+memfd_push_heap (memfd_shared_header_t * sh)
+{
+  /* Switch to the segment heap; hand back what clib_mem_set_heap returned */
+  void *previous_heap = clib_mem_set_heap (sh->heap);
+
+  return previous_heap;
+}
+
+static inline void
+memfd_pop_heap (void *oldheap)
+{
+  (void) clib_mem_set_heap (oldheap);   /* reinstate the heap saved by push */
+}
+
+#define foreach_memfd_api_error \
+_(NO_NAME, "No shared segment name", -100) \
+_(NO_SIZE, "Size not set (master)", -101) \
+_(CREATE_FAILURE, "Create failed", -102) \
+_(SET_SIZE, "Set size failed", -103) \
+_(MMAP, "mmap failed", -104) \
+_(SLAVE_TIMEOUT, "Slave map timeout", -105)
+/* Each _(name, string, code) entry becomes MEMFD_API_ERROR_<name> = code */
+typedef enum
+{
+#define _(n,s,c) MEMFD_API_ERROR_##n = c,
+  foreach_memfd_api_error
+#undef _
+} memfd_api_error_enum_t;
+/* Macro alias: must carry the same value as the NO_NAME enumerator (-100) */
+#define MEMFD_API_ERROR_NO_NAME (-100)
+
+int memfd_master_init (memfd_private_t * memfd, u32 master_index); /* 0 or MEMFD_API_ERROR_* */
+int memfd_slave_init (memfd_private_t * memfd); /* 0 or MEMFD_API_ERROR_MMAP; memfd->fd must be set */
+void memfd_delete (memfd_private_t * memfd); /* NOTE(review): no definition in this patch's memfd.c - confirm */
+
+/* These do not belong here, but the original keeps running around... */
+/* $$$$ work w/ Damjan to fix properly */
+
+#ifndef F_LINUX_SPECIFIC_BASE
+#define F_LINUX_SPECIFIC_BASE 1024
+#endif /* NOTE(review): the defines below are unguarded - may clash with <fcntl.h>; confirm */
+#define MFD_ALLOW_SEALING 0x0002U /* memfd_create flag used by memfd_master_init */
+#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) /* fcntl command: add seals */
+#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) /* fcntl command: read seals */
+/* Seal bits passed to F_ADD_SEALS */
+#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */
+#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */
+#define F_SEAL_GROW 0x0004 /* prevent file from growing */
+#define F_SEAL_WRITE 0x0008 /* prevent writes */
+
+#endif /* __included_memfd_h__ */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/svm/svm.c b/src/svm/svm.c
index f97803c..c54f973 100644
--- a/src/svm/svm.c
+++ b/src/svm/svm.c
@@ -434,6 +434,107 @@
return (shm_name);
}
+void
+svm_region_init_mapped_region (svm_map_region_args_t * a, svm_region_t * rp)
+{
+ pthread_mutexattr_t attr;
+ pthread_condattr_t cattr;
+ int nbits, words, bit;
+ int overhead_space;
+ void *oldheap;
+ uword data_base;
+ ASSERT (rp);
+ int rv;
+ /* Build a fresh region header at rp for the mapping described by a */
+ memset (rp, 0, sizeof (*rp));
+ /* Mutex and condvar are created process-shared; failures only warn */
+ if (pthread_mutexattr_init (&attr))
+ clib_unix_warning ("mutexattr_init");
+
+ if (pthread_mutexattr_setpshared (&attr, PTHREAD_PROCESS_SHARED))
+ clib_unix_warning ("mutexattr_setpshared");
+
+ if (pthread_mutex_init (&rp->mutex, &attr))
+ clib_unix_warning ("mutex_init");
+
+ if (pthread_mutexattr_destroy (&attr))
+ clib_unix_warning ("mutexattr_destroy");
+
+ if (pthread_condattr_init (&cattr))
+ clib_unix_warning ("condattr_init");
+
+ if (pthread_condattr_setpshared (&cattr, PTHREAD_PROCESS_SHARED))
+ clib_unix_warning ("condattr_setpshared");
+
+ if (pthread_cond_init (&rp->condvar, &cattr))
+ clib_unix_warning ("cond_init");
+
+ if (pthread_condattr_destroy (&cattr))
+ clib_unix_warning ("condattr_destroy");
+
+ region_lock (rp, 1);
+ /* The region lock is held from here until region_unlock below */
+ rp->virtual_base = a->baseva;
+ rp->virtual_size = a->size;
+ /* The private heap is placed one MMAP_PAGESIZE past the region base */
+ rp->region_heap =
+ mheap_alloc_with_flags (uword_to_pointer
+ (a->baseva + MMAP_PAGESIZE, void *),
+ (a->pvt_heap_size !=
+ 0) ? a->pvt_heap_size : SVM_PVT_MHEAP_SIZE,
+ MHEAP_FLAG_DISABLE_VM);
+ oldheap = svm_push_pvt_heap (rp);
+
+ rp->region_name = (char *) format (0, "%s%c", a->name, 0);
+ vec_add1 (rp->client_pids, getpid ());
+ /* One bitmap bit per MMAP_PAGESIZE page of the region */
+ nbits = rp->virtual_size / MMAP_PAGESIZE;
+
+ ASSERT (nbits > 0);
+ rp->bitmap_size = nbits;
+ words = (nbits + BITS (uword) - 1) / BITS (uword);
+ vec_validate (rp->bitmap, words - 1);
+ /* Overhead = header page + private heap (SVM_PVT_MHEAP_SIZE if unset) */
+ overhead_space = MMAP_PAGESIZE /* header */ +
+ ((a->pvt_heap_size != 0) ? a->pvt_heap_size : SVM_PVT_MHEAP_SIZE);
+
+ bit = 0;
+ data_base = (uword) rp->virtual_base;
+
+ if (a->flags & SVM_FLAGS_NODATA)
+ rp->flags |= SVM_FLAGS_NEED_DATA_INIT;
+ /* Set one bitmap bit per overhead page and advance data_base past it */
+ do
+ {
+ clib_bitmap_set_no_check (rp->bitmap, bit, 1);
+ bit++;
+ overhead_space -= MMAP_PAGESIZE;
+ data_base += MMAP_PAGESIZE;
+ }
+ while (overhead_space > 0);
+
+ rp->data_base = (void *) data_base;
+
+ /*
+ * Note: although the POSIX spec guarantees that only one
+ * process enters this block, we have to play games
+ * to hold off clients until e.g. the mutex is ready
+ */
+ rp->version = SVM_VERSION;
+
+ /* setup the data portion of the region */
+ /* A failure here is only logged; the function still runs to completion */
+ rv = svm_data_region_create (a, rp);
+ if (rv)
+ {
+ clib_warning ("data_region_create: %d", rv);
+ }
+
+ region_unlock (rp);
+
+ svm_pop_heap (oldheap);
+}
+
/*
* svm_map_region
*/
@@ -442,15 +543,10 @@
{
int svm_fd;
svm_region_t *rp;
- pthread_mutexattr_t attr;
- pthread_condattr_t cattr;
int deadman = 0;
u8 junk = 0;
void *oldheap;
- int overhead_space;
int rv;
- uword data_base;
- int nbits, words, bit;
int pid_holding_region_lock;
u8 *shm_name;
int dead_region_recovery = 0;
@@ -502,93 +598,8 @@
return (0);
}
close (svm_fd);
- memset (rp, 0, sizeof (*rp));
- if (pthread_mutexattr_init (&attr))
- clib_unix_warning ("mutexattr_init");
-
- if (pthread_mutexattr_setpshared (&attr, PTHREAD_PROCESS_SHARED))
- clib_unix_warning ("mutexattr_setpshared");
-
- if (pthread_mutex_init (&rp->mutex, &attr))
- clib_unix_warning ("mutex_init");
-
- if (pthread_mutexattr_destroy (&attr))
- clib_unix_warning ("mutexattr_destroy");
-
- if (pthread_condattr_init (&cattr))
- clib_unix_warning ("condattr_init");
-
- if (pthread_condattr_setpshared (&cattr, PTHREAD_PROCESS_SHARED))
- clib_unix_warning ("condattr_setpshared");
-
- if (pthread_cond_init (&rp->condvar, &cattr))
- clib_unix_warning ("cond_init");
-
- if (pthread_condattr_destroy (&cattr))
- clib_unix_warning ("condattr_destroy");
-
- region_lock (rp, 1);
-
- rp->virtual_base = a->baseva;
- rp->virtual_size = a->size;
-
- rp->region_heap =
- mheap_alloc_with_flags (uword_to_pointer
- (a->baseva + MMAP_PAGESIZE, void *),
- (a->pvt_heap_size !=
- 0) ? a->pvt_heap_size : SVM_PVT_MHEAP_SIZE,
- MHEAP_FLAG_DISABLE_VM);
- oldheap = svm_push_pvt_heap (rp);
-
- rp->region_name = (char *) format (0, "%s%c", a->name, 0);
- vec_add1 (rp->client_pids, getpid ());
-
- nbits = rp->virtual_size / MMAP_PAGESIZE;
-
- ASSERT (nbits > 0);
- rp->bitmap_size = nbits;
- words = (nbits + BITS (uword) - 1) / BITS (uword);
- vec_validate (rp->bitmap, words - 1);
-
- overhead_space = MMAP_PAGESIZE /* header */ +
- ((a->pvt_heap_size != 0) ? a->pvt_heap_size : SVM_PVT_MHEAP_SIZE);
-
- bit = 0;
- data_base = (uword) rp->virtual_base;
-
- if (a->flags & SVM_FLAGS_NODATA)
- rp->flags |= SVM_FLAGS_NEED_DATA_INIT;
-
- do
- {
- clib_bitmap_set_no_check (rp->bitmap, bit, 1);
- bit++;
- overhead_space -= MMAP_PAGESIZE;
- data_base += MMAP_PAGESIZE;
- }
- while (overhead_space > 0);
-
- rp->data_base = (void *) data_base;
-
- /*
- * Note: although the POSIX spec guarantees that only one
- * process enters this block, we have to play games
- * to hold off clients until e.g. the mutex is ready
- */
- rp->version = SVM_VERSION;
-
- /* setup the data portion of the region */
-
- rv = svm_data_region_create (a, rp);
- if (rv)
- {
- clib_warning ("data_region_create: %d", rv);
- }
-
- region_unlock (rp);
-
- svm_pop_heap (oldheap);
+ svm_region_init_mapped_region (a, rp);
return ((void *) rp);
}
diff --git a/src/svm/svm_common.h b/src/svm/svm_common.h
index ea3ec87..a716028 100644
--- a/src/svm/svm_common.h
+++ b/src/svm/svm_common.h
@@ -112,6 +112,8 @@
void *svm_region_find_or_create (svm_map_region_args_t * a);
void svm_region_init (void);
+void svm_region_init_mapped_region (svm_map_region_args_t * a,
+ svm_region_t * rp);
int svm_region_init_chroot (const char *root_path);
void svm_region_init_chroot_uid_gid (const char *root_path, int uid, int gid);
void svm_region_init_args (svm_map_region_args_t * a);
diff --git a/src/svm/svmdb.c b/src/svm/svmdb.c
index 043b092..03aa1f1 100644
--- a/src/svm/svmdb.c
+++ b/src/svm/svmdb.c
@@ -456,7 +456,7 @@
goto out;
}
- serialize_open_unix_file_descriptor (sm, fd);
+ serialize_open_clib_file_descriptor (sm, fd);
region_lock (client->db_rp, 20);
@@ -512,7 +512,7 @@
goto out;
}
- unserialize_open_unix_file_descriptor (sm, fd);
+ unserialize_open_clib_file_descriptor (sm, fd);
region_lock (client->db_rp, 21);
oldheap = svm_push_data_heap (client->db_rp);