Repair vlib API socket server

- Teach vpp_api_test to send/receive API messages over sockets
- Add memfd-based shared memory
- Add api messages to create memfd-based shared memory segments
- vpp_api_test supports both socket and shared memory segment connections
- vpp_api_test pivot from socket to shared memory API messaging
- add socket client support to libvlibclient.so
- dead client reaper sends ping messages, container-friendly
- dead client reaper falls back to kill (<pid>, 0) live checking
  if e.g. a python app goes silent for tens of seconds
- handle ping messages in python client support code
- teach show api ring about pairwise shared-memory segments
- fix ip probing of already resolved destinations (VPP-998)

We'll need this work to implement proper host-stack client isolation

Change-Id: Ic23b65f75c854d0393d9a2e9d6b122a9551be769
Signed-off-by: Dave Barach <dave@barachs.net>
Signed-off-by: Dave Wallace <dwallacelf@gmail.com>
Signed-off-by: Florin Coras <fcoras@cisco.com>
diff --git a/src/svm/memfd.c b/src/svm/memfd.c
new file mode 100644
index 0000000..9fe487d
--- /dev/null
+++ b/src/svm/memfd.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2017 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "memfd.h"
+/*
+ * Create, size, seal and map a memfd segment, then build a heap inside it.
+ * Reads memfd->memfd_size and requested_va; sets name, fd, sh, my_pid and
+ * i_am_master. Returns 0 on success, else a MEMFD_API_ERROR_* code.
+ */
+int
+memfd_master_init (memfd_private_t * memfd, u32 master_index)
+{
+  int flags;
+  memfd_shared_header_t *sh;
+  u64 ticks = clib_cpu_time_now ();
+  u64 randomize_baseva;
+  void *oldheap;
+
+  if (memfd->memfd_size == 0)
+    return MEMFD_API_ERROR_NO_SIZE;
+  /* memfd_create() and "%s" take C strings, so NUL-terminate the vector */
+  memfd->name = format (0, "memfd svm region %d%c", master_index, 0);
+  ASSERT (vec_c_string_is_terminated (memfd->name));
+
+  memfd->fd = memfd_create ((char *) memfd->name, MFD_ALLOW_SEALING);
+  if (memfd->fd < 0)
+    {
+      clib_unix_warning ("create segment '%s'", memfd->name);
+      return MEMFD_API_ERROR_CREATE_FAILURE;
+    }
+
+  if ((ftruncate (memfd->fd, memfd->memfd_size)) == -1)
+    {
+      clib_unix_warning ("set memfd size");
+      close (memfd->fd);	/* do not leak the descriptor */
+      return MEMFD_API_ERROR_SET_SIZE;
+    }
+  /* Sealing failure is only logged; initialization carries on */
+  if ((fcntl (memfd->fd, F_ADD_SEALS, F_SEAL_SHRINK)) == -1)
+    clib_unix_warning ("fcntl (F_ADD_SEALS, F_SEAL_SHRINK)");
+
+  flags = MAP_SHARED;
+  if (memfd->requested_va)
+    flags |= MAP_FIXED;
+  /* Shift a caller-requested base address by 0..15 pages */
+  randomize_baseva = (ticks & 15) * MMAP_PAGESIZE;
+  if (memfd->requested_va)
+    memfd->requested_va += randomize_baseva;
+
+  sh = memfd->sh =
+    (memfd_shared_header_t *) mmap ((void *) memfd->requested_va,
+				    memfd->memfd_size, PROT_READ | PROT_WRITE,
+				    flags, memfd->fd, 0);
+  if (memfd->sh == MAP_FAILED)
+    {
+      clib_unix_warning ("mmap");
+      close (memfd->fd);
+      return MEMFD_API_ERROR_MMAP;
+    }
+
+  memfd->my_pid = getpid ();
+  sh->master_pid = memfd->my_pid;
+  sh->memfd_size = memfd->memfd_size;
+  sh->heap = mheap_alloc_with_flags
+    (((u8 *) sh) + MMAP_PAGESIZE, memfd->memfd_size - MMAP_PAGESIZE,
+     MHEAP_FLAG_DISABLE_VM | MHEAP_FLAG_THREAD_SAFE);
+  sh->memfd_va = pointer_to_uword (sh);
+  sh->master_index = master_index;
+  /* Copy the name into the segment's own heap */
+  oldheap = memfd_push_heap (sh);
+  sh->name = format (0, "%s%c", memfd->name, 0);
+  memfd_pop_heap (oldheap);
+  memfd->i_am_master = 1;
+  /* The application has to set sh->ready... */
+  return 0;
+}
+
+/*
+ * Subtly different than svm_slave_init. The caller
+ * needs to acquire a usable file descriptor for the memfd segment
+ * e.g. via vppinfra/socket.c:default_socket_recvmsg
+ */
+
+int
+memfd_slave_init (memfd_private_t * memfd)
+{
+  memfd_shared_header_t *hdr;
+
+  memfd->i_am_master = 0;
+
+  /* First pass: map just the header page to learn the segment geometry */
+  hdr = (void *) mmap (0, MMAP_PAGESIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
+		       memfd->fd, 0);
+  if (hdr == MAP_FAILED)
+    {
+      clib_unix_warning ("slave research mmap");
+      close (memfd->fd);
+      return MEMFD_API_ERROR_MMAP;
+    }
+
+  memfd->memfd_size = hdr->memfd_size;
+  memfd->requested_va = (u64) hdr->memfd_va;
+  munmap (hdr, MMAP_PAGESIZE);
+
+  /* Second pass: map the whole segment at the address the header names */
+  memfd->sh = (void *) mmap ((void *) memfd->requested_va,
+			     memfd->memfd_size, PROT_READ | PROT_WRITE,
+			     MAP_SHARED | MAP_FIXED, memfd->fd, 0);
+  hdr = memfd->sh;
+  if (hdr == MAP_FAILED)
+    {
+      clib_unix_warning ("slave final mmap");
+      close (memfd->fd);
+      return MEMFD_API_ERROR_MMAP;
+    }
+  hdr->slave_pid = getpid ();
+  return 0;
+}
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/svm/memfd.h b/src/svm/memfd.h
new file mode 100644
index 0000000..3ed4a9a
--- /dev/null
+++ b/src/svm/memfd.h
@@ -0,0 +1,194 @@
+/*
+ * Copyright (c) 2017 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __included_memfd_h__
+#define __included_memfd_h__
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <netinet/in.h>
+#include <signal.h>
+#include <pthread.h>
+#include <unistd.h>
+#include <time.h>
+#include <fcntl.h>
+#include <string.h>
+#include <vppinfra/clib.h>
+#include <vppinfra/vec.h>
+#include <vppinfra/hash.h>
+#include <vppinfra/bitmap.h>
+#include <vppinfra/fifo.h>
+#include <vppinfra/time.h>
+#include <vppinfra/mheap.h>
+#include <vppinfra/heap.h>
+#include <vppinfra/pool.h>
+#include <vppinfra/format.h>
+/* DGMS, memfd syscall not in glibc... */
+#include <vppinfra/linux/syscall.h>
+
+#ifndef MMAP_PAGESIZE
+#define MMAP_PAGESIZE (clib_mem_get_page_size())
+#endif
+
+#define MEMFD_N_OPAQUE 7
+
+typedef struct
+{
+  /* Spin-lock */
+  volatile u32 lock;		/* 0 = free, 1 = held */
+  volatile u32 owner_pid;	/* pid holding the recursive lock, 0 if none */
+  int recursion_count;		/* nesting depth of the recursive lock */
+  u32 tag;			/* for debugging */
+
+  /* The allocation arena */
+  void *heap;
+
+  /* Segment must be mapped at this address, or no supper */
+  u64 memfd_va;
+  /* The actual mmap size */
+  u64 memfd_size;
+  u32 master_pid;		/* written by memfd_master_init */
+  u32 slave_pid;		/* written by memfd_slave_init */
+  u8 *name;			/* allocated on the segment heap by the master */
+  void *opaque[MEMFD_N_OPAQUE];
+
+  /* Set when the master application thinks it's time to make the donuts */
+  volatile u32 ready;
+
+  /* Needed to make unique MAC addresses, etc. */
+  u32 master_index;
+} memfd_shared_header_t;
+
+typedef struct
+{
+  memfd_shared_header_t *sh;	/* mapped header, set by master/slave init */
+  int fd;			/* memfd: master creates it, slave caller supplies it */
+  u64 memfd_size;		/* master input; slave copies it from the header */
+  u32 my_pid;			/* set by memfd_master_init */
+  u32 vlib_hw_if_index;		/* NOTE(review): not used by memfd.c - confirm */
+  uword requested_va;		/* map address; 0 means no MAP_FIXED (master) */
+  int i_am_master;		/* 1 after master init, 0 after slave init */
+  u32 per_interface_next_index;
+  u32 *rx_queue;
+  u8 *name;			/* segment name built by memfd_master_init */
+} memfd_private_t;
+/* Take the header spin-lock; nests if my_pid is already the owner */
+always_inline void
+memfd_lock (memfd_shared_header_t * h, u32 my_pid, u32 tag)
+{
+  if (h->owner_pid == my_pid)
+    {
+      h->recursion_count++;
+      return;
+    }
+
+  while (__sync_lock_test_and_set (&h->lock, 1))
+    ;				/* spin until the previous value was 0 */
+
+  h->owner_pid = my_pid;
+  h->recursion_count = 1;
+  h->tag = tag;
+}
+/* Spin for the header lock without owner or recursion bookkeeping */
+always_inline void
+memfd_lock_non_recursive (memfd_shared_header_t * h, u32 tag)
+{
+  while (__sync_lock_test_and_set (&h->lock, 1))
+    ;
+
+  h->tag = tag;			/* debug tag only; owner_pid is untouched */
+}
+/* Undo one memfd_lock; the lock word is cleared when nesting hits zero */
+always_inline void
+memfd_unlock (memfd_shared_header_t * h)
+{
+  if (--h->recursion_count == 0)
+    {
+      h->owner_pid = 0;
+      h->tag = 0;
+      CLIB_MEMORY_BARRIER ();	/* sits between bookkeeping and release */
+      h->lock = 0;
+    }
+}
+/* Release a lock taken with memfd_lock_non_recursive */
+always_inline void
+memfd_unlock_non_recursive (memfd_shared_header_t * h)
+{
+  h->tag = 0;
+  CLIB_MEMORY_BARRIER ();
+  h->lock = 0;			/* clearing the word frees the spin-lock */
+}
+/* Switch to the segment heap; keep the result for memfd_pop_heap */
+static inline void *
+memfd_push_heap (memfd_shared_header_t * sh)
+{
+  void *previous = clib_mem_set_heap (sh->heap);
+
+  return previous;
+}
+/* Reinstate a heap previously returned by memfd_push_heap */
+static inline void
+memfd_pop_heap (void *oldheap)
+{
+  clib_mem_set_heap (oldheap);
+}
+/* memfd API error table; each entry is _(name, description, code) */
+#define foreach_memfd_api_error                  \
+_(NO_NAME, "No shared segment name", -100)      \
+_(NO_SIZE, "Size not set (master)", -101)       \
+_(CREATE_FAILURE, "Create failed", -102)        \
+_(SET_SIZE, "Set size failed", -103)		\
+_(MMAP, "mmap failed", -104)			\
+_(SLAVE_TIMEOUT, "Slave map timeout", -105)
+
+typedef enum
+{
+#define _(n,s,c) MEMFD_API_ERROR_##n = c,
+  foreach_memfd_api_error
+#undef _
+} memfd_api_error_enum_t;
+
+/* No #define for NO_NAME: a macro would shadow the -100 enumerator */
+
+int memfd_master_init (memfd_private_t * memfd, u32 master_index);
+int memfd_slave_init (memfd_private_t * memfd);
+void memfd_delete (memfd_private_t * memfd);
+/* NOTE(review): memfd.c as added here does not define memfd_delete */
+/* These do not belong here, but the original keeps running around... */
+/* $$$$ work w/ Damjan to fix properly */
+
+#ifndef F_LINUX_SPECIFIC_BASE
+#define F_LINUX_SPECIFIC_BASE 1024
+#endif
+#define MFD_ALLOW_SEALING       0x0002U
+#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9)
+#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10)
+
+#define F_SEAL_SEAL     0x0001	/* prevent further seals from being set */
+#define F_SEAL_SHRINK   0x0002	/* prevent file from shrinking */
+#define F_SEAL_GROW     0x0004	/* prevent file from growing */
+#define F_SEAL_WRITE    0x0008	/* prevent writes */
+
+#endif /* __included_memfd_h__ */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/svm/svm.c b/src/svm/svm.c
index f97803c..c54f973 100644
--- a/src/svm/svm.c
+++ b/src/svm/svm.c
@@ -434,6 +434,107 @@
   return (shm_name);
 }
 
+void
+svm_region_init_mapped_region (svm_map_region_args_t * a, svm_region_t * rp)
+{
+  pthread_mutexattr_t attr;
+  pthread_condattr_t cattr;
+  int nbits, words, bit;
+  int overhead_space;
+  void *oldheap;
+  uword data_base;
+  ASSERT (rp);
+  int rv;
+  /* Set up a freshly mapped region header: locks, heap, bitmap, data */
+  memset (rp, 0, sizeof (*rp));
+  /* Mutex and condvar are created PTHREAD_PROCESS_SHARED */
+  if (pthread_mutexattr_init (&attr))
+    clib_unix_warning ("mutexattr_init");
+
+  if (pthread_mutexattr_setpshared (&attr, PTHREAD_PROCESS_SHARED))
+    clib_unix_warning ("mutexattr_setpshared");
+
+  if (pthread_mutex_init (&rp->mutex, &attr))
+    clib_unix_warning ("mutex_init");
+
+  if (pthread_mutexattr_destroy (&attr))
+    clib_unix_warning ("mutexattr_destroy");
+
+  if (pthread_condattr_init (&cattr))
+    clib_unix_warning ("condattr_init");
+
+  if (pthread_condattr_setpshared (&cattr, PTHREAD_PROCESS_SHARED))
+    clib_unix_warning ("condattr_setpshared");
+
+  if (pthread_cond_init (&rp->condvar, &cattr))
+    clib_unix_warning ("cond_init");
+
+  if (pthread_condattr_destroy (&cattr))
+    clib_unix_warning ("condattr_destroy");
+
+  region_lock (rp, 1);
+
+  rp->virtual_base = a->baseva;
+  rp->virtual_size = a->size;
+  /* The private heap begins one page above the region base */
+  rp->region_heap =
+    mheap_alloc_with_flags (uword_to_pointer
+			    (a->baseva + MMAP_PAGESIZE, void *),
+			    (a->pvt_heap_size !=
+			     0) ? a->pvt_heap_size : SVM_PVT_MHEAP_SIZE,
+			    MHEAP_FLAG_DISABLE_VM);
+  oldheap = svm_push_pvt_heap (rp);
+
+  rp->region_name = (char *) format (0, "%s%c", a->name, 0);
+  vec_add1 (rp->client_pids, getpid ());
+  /* The bitmap carries one bit per page of the region */
+  nbits = rp->virtual_size / MMAP_PAGESIZE;
+
+  ASSERT (nbits > 0);
+  rp->bitmap_size = nbits;
+  words = (nbits + BITS (uword) - 1) / BITS (uword);
+  vec_validate (rp->bitmap, words - 1);
+
+  overhead_space = MMAP_PAGESIZE /* header */  +
+    ((a->pvt_heap_size != 0) ? a->pvt_heap_size : SVM_PVT_MHEAP_SIZE);
+
+  bit = 0;
+  data_base = (uword) rp->virtual_base;
+
+  if (a->flags & SVM_FLAGS_NODATA)
+    rp->flags |= SVM_FLAGS_NEED_DATA_INIT;
+  /* Set a bit per overhead page; data_base ends up just past them */
+  do
+    {
+      clib_bitmap_set_no_check (rp->bitmap, bit, 1);
+      bit++;
+      overhead_space -= MMAP_PAGESIZE;
+      data_base += MMAP_PAGESIZE;
+    }
+  while (overhead_space > 0);
+
+  rp->data_base = (void *) data_base;
+
+  /*
+   * Note: although the POSIX spec guarantees that only one
+   * process enters this block, we have to play games
+   * to hold off clients until e.g. the mutex is ready
+   */
+  rp->version = SVM_VERSION;
+
+  /* setup the data portion of the region */
+
+  rv = svm_data_region_create (a, rp);
+  if (rv)
+    {
+      clib_warning ("data_region_create: %d", rv);
+    }
+
+  region_unlock (rp);
+
+  svm_pop_heap (oldheap);
+}
+
 /*
  * svm_map_region
  */
@@ -442,15 +543,10 @@
 {
   int svm_fd;
   svm_region_t *rp;
-  pthread_mutexattr_t attr;
-  pthread_condattr_t cattr;
   int deadman = 0;
   u8 junk = 0;
   void *oldheap;
-  int overhead_space;
   int rv;
-  uword data_base;
-  int nbits, words, bit;
   int pid_holding_region_lock;
   u8 *shm_name;
   int dead_region_recovery = 0;
@@ -502,93 +598,8 @@
 	  return (0);
 	}
       close (svm_fd);
-      memset (rp, 0, sizeof (*rp));
 
-      if (pthread_mutexattr_init (&attr))
-	clib_unix_warning ("mutexattr_init");
-
-      if (pthread_mutexattr_setpshared (&attr, PTHREAD_PROCESS_SHARED))
-	clib_unix_warning ("mutexattr_setpshared");
-
-      if (pthread_mutex_init (&rp->mutex, &attr))
-	clib_unix_warning ("mutex_init");
-
-      if (pthread_mutexattr_destroy (&attr))
-	clib_unix_warning ("mutexattr_destroy");
-
-      if (pthread_condattr_init (&cattr))
-	clib_unix_warning ("condattr_init");
-
-      if (pthread_condattr_setpshared (&cattr, PTHREAD_PROCESS_SHARED))
-	clib_unix_warning ("condattr_setpshared");
-
-      if (pthread_cond_init (&rp->condvar, &cattr))
-	clib_unix_warning ("cond_init");
-
-      if (pthread_condattr_destroy (&cattr))
-	clib_unix_warning ("condattr_destroy");
-
-      region_lock (rp, 1);
-
-      rp->virtual_base = a->baseva;
-      rp->virtual_size = a->size;
-
-      rp->region_heap =
-	mheap_alloc_with_flags (uword_to_pointer
-				(a->baseva + MMAP_PAGESIZE, void *),
-				(a->pvt_heap_size !=
-				 0) ? a->pvt_heap_size : SVM_PVT_MHEAP_SIZE,
-				MHEAP_FLAG_DISABLE_VM);
-      oldheap = svm_push_pvt_heap (rp);
-
-      rp->region_name = (char *) format (0, "%s%c", a->name, 0);
-      vec_add1 (rp->client_pids, getpid ());
-
-      nbits = rp->virtual_size / MMAP_PAGESIZE;
-
-      ASSERT (nbits > 0);
-      rp->bitmap_size = nbits;
-      words = (nbits + BITS (uword) - 1) / BITS (uword);
-      vec_validate (rp->bitmap, words - 1);
-
-      overhead_space = MMAP_PAGESIZE /* header */  +
-	((a->pvt_heap_size != 0) ? a->pvt_heap_size : SVM_PVT_MHEAP_SIZE);
-
-      bit = 0;
-      data_base = (uword) rp->virtual_base;
-
-      if (a->flags & SVM_FLAGS_NODATA)
-	rp->flags |= SVM_FLAGS_NEED_DATA_INIT;
-
-      do
-	{
-	  clib_bitmap_set_no_check (rp->bitmap, bit, 1);
-	  bit++;
-	  overhead_space -= MMAP_PAGESIZE;
-	  data_base += MMAP_PAGESIZE;
-	}
-      while (overhead_space > 0);
-
-      rp->data_base = (void *) data_base;
-
-      /*
-       * Note: although the POSIX spec guarantees that only one
-       * process enters this block, we have to play games
-       * to hold off clients until e.g. the mutex is ready
-       */
-      rp->version = SVM_VERSION;
-
-      /* setup the data portion of the region */
-
-      rv = svm_data_region_create (a, rp);
-      if (rv)
-	{
-	  clib_warning ("data_region_create: %d", rv);
-	}
-
-      region_unlock (rp);
-
-      svm_pop_heap (oldheap);
+      svm_region_init_mapped_region (a, rp);
 
       return ((void *) rp);
     }
diff --git a/src/svm/svm_common.h b/src/svm/svm_common.h
index ea3ec87..a716028 100644
--- a/src/svm/svm_common.h
+++ b/src/svm/svm_common.h
@@ -112,6 +112,8 @@
 
 void *svm_region_find_or_create (svm_map_region_args_t * a);
 void svm_region_init (void);
+void svm_region_init_mapped_region (svm_map_region_args_t * a,
+				    svm_region_t * rp);
 int svm_region_init_chroot (const char *root_path);
 void svm_region_init_chroot_uid_gid (const char *root_path, int uid, int gid);
 void svm_region_init_args (svm_map_region_args_t * a);
diff --git a/src/svm/svmdb.c b/src/svm/svmdb.c
index 043b092..03aa1f1 100644
--- a/src/svm/svmdb.c
+++ b/src/svm/svmdb.c
@@ -456,7 +456,7 @@
       goto out;
     }
 
-  serialize_open_unix_file_descriptor (sm, fd);
+  serialize_open_clib_file_descriptor (sm, fd);
 
   region_lock (client->db_rp, 20);
 
@@ -512,7 +512,7 @@
       goto out;
     }
 
-  unserialize_open_unix_file_descriptor (sm, fd);
+  unserialize_open_clib_file_descriptor (sm, fd);
 
   region_lock (client->db_rp, 21);
   oldheap = svm_push_data_heap (client->db_rp);