32/64 shmem bihash interoperability

Move the binary API segment above 4 GB

Change-Id: I40e8aa7a97722a32397f5a538b5ff8344c50d408
Signed-off-by: Dave Barach <dave@barachs.net>
diff --git a/src/vppinfra/bihash_template.c b/src/vppinfra/bihash_template.c
index e13ceb7..fa92c8b 100644
--- a/src/vppinfra/bihash_template.c
+++ b/src/vppinfra/bihash_template.c
@@ -23,10 +23,10 @@
   nbytes += CLIB_CACHE_LINE_BYTES - 1;
   nbytes &= ~(CLIB_CACHE_LINE_BYTES - 1);
 
-  rv = h->alloc_arena_next;
-  h->alloc_arena_next += nbytes;
+  rv = alloc_arena_next (h);
+  alloc_arena_next (h) += nbytes;
 
-  if (rv >= (h->alloc_arena + h->alloc_arena_size))
+  if (rv >= (alloc_arena (h) + alloc_arena_size (h)))
     os_out_of_memory ();
 
   return (void *) rv;
@@ -52,9 +52,9 @@
    */
   ASSERT (memory_size < (1ULL << BIHASH_BUCKET_OFFSET_BITS));
 
-  h->alloc_arena = (uword) clib_mem_vm_alloc (memory_size);
-  h->alloc_arena_next = h->alloc_arena;
-  h->alloc_arena_size = memory_size;
+  alloc_arena (h) = (uword) clib_mem_vm_alloc (memory_size);
+  alloc_arena_next (h) = alloc_arena (h);
+  alloc_arena_size (h) = memory_size;
 
   bucket_size = nbuckets * sizeof (h->buckets[0]);
   h->buckets = BV (alloc_aligned) (h, bucket_size);
@@ -65,6 +65,129 @@
   h->fmt_fn = NULL;
 }
 
+#if BIHASH_32_64_SVM
+#if !defined (MFD_ALLOW_SEALING)
+#define MFD_ALLOW_SEALING 0x0002U
+#endif
+
+/* Master: create memfd NAME, map it at BASE_ADDRESS, build the shared table.
+   On a setup failure: warn, close the fd, and return without touching H. */
+void BV (clib_bihash_master_init_svm)
+  (BVT (clib_bihash) * h, char *name, u32 nbuckets,
+   u64 base_address, u64 memory_size)
+{
+  uword bucket_size;
+  u8 *mmap_addr;
+  vec_header_t *freelist_vh;
+  int fd;
+
+  ASSERT (base_address);
+  ASSERT (base_address + memory_size < (1ULL << 32));
+  /* Set up for memfd sharing */
+  if ((fd = memfd_create (name, MFD_ALLOW_SEALING)) == -1)
+    {
+      clib_unix_warning ("memfd_create");
+      return;
+    }
+  if (ftruncate (fd, memory_size) < 0)
+    {
+      clib_unix_warning ("ftruncate");
+      (void) close (fd);	/* don't leak the memfd */
+      return;
+    }
+
+  /* Not mission-critical, complain and continue */
+  if ((fcntl (fd, F_ADD_SEALS, F_SEAL_SHRINK)) == -1)
+    clib_unix_warning ("fcntl (F_ADD_SEALS)");
+
+  mmap_addr = mmap (u64_to_pointer (base_address), memory_size,
+		    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, fd,
+		    0 /* offset */ );
+  if (mmap_addr == MAP_FAILED)
+    {
+      clib_unix_warning ("mmap failed");
+      ASSERT (0);
+      (void) close (fd);
+      return;			/* never dereference MAP_FAILED */
+    }
+
+  h->sh = (void *) mmap_addr;
+  h->memfd = fd;
+  nbuckets = 1 << (max_log2 (nbuckets));
+  h->name = (u8 *) name;
+  h->sh->nbuckets = h->nbuckets = nbuckets;
+  h->log2_nbuckets = max_log2 (nbuckets);
+
+  /* The first cache line of the segment (where h->sh points) is skipped */
+  alloc_arena (h) = (u64) (uword) mmap_addr;
+  alloc_arena_next (h) = alloc_arena (h) + CLIB_CACHE_LINE_BYTES;
+  alloc_arena_size (h) = memory_size;
+  bucket_size = nbuckets * sizeof (h->buckets[0]);
+  h->buckets = BV (alloc_aligned) (h, bucket_size);
+  h->sh->buckets_as_u64 = (u64) (uword) h->buckets;
+  h->alloc_lock = BV (alloc_aligned) (h, CLIB_CACHE_LINE_BYTES);
+  h->alloc_lock[0] = 0;
+  h->sh->alloc_lock_as_u64 = (u64) (uword) (h->alloc_lock);
+
+  /* Freelist vector gets a hand-built vec header inside the segment */
+  freelist_vh = BV (alloc_aligned) (h, sizeof (vec_header_t) +
+				    BIHASH_FREELIST_LENGTH * sizeof (u64));
+  freelist_vh->len = BIHASH_FREELIST_LENGTH;
+  freelist_vh->dlmalloc_header_offset = 0xDEADBEEF;
+  h->sh->freelists_as_u64 = (u64) (uword) freelist_vh->vector_data;
+  h->freelists = (void *) (uword) (h->sh->freelists_as_u64);
+  h->fmt_fn = NULL;
+}
+
+/* Slave: map the segment behind FD at the address recorded in its shared
+   header and attach H to it.  FD is closed on success and on mmap failure. */
+void BV (clib_bihash_slave_init_svm)
+  (BVT (clib_bihash) * h, char *name, int fd)
+{
+  u8 *mmap_addr;
+  u64 base_address, memory_size;
+  BVT (clib_bihash_shared_header) * sh;
+
+  /* Trial mapping, to place the segment */
+  mmap_addr = mmap (0, 4096, PROT_READ, MAP_SHARED, fd, 0 /* offset */ );
+  if (mmap_addr == MAP_FAILED)
+    {
+      clib_unix_warning ("trial mmap failed");
+      ASSERT (0);
+      (void) close (fd);
+      return;			/* never dereference MAP_FAILED */
+    }
+
+  sh = (BVT (clib_bihash_shared_header) *) mmap_addr;
+  base_address = sh->alloc_arena;
+  memory_size = sh->alloc_arena_size;
+  munmap (mmap_addr, 4096);
+
+  /* Actual mapping, at the required address */
+  mmap_addr = mmap (u64_to_pointer (base_address), memory_size,
+		    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, fd,
+		    0 /* offset */ );
+  if (mmap_addr == MAP_FAILED)
+    {
+      clib_unix_warning ("mmap failed");
+      ASSERT (0);
+      (void) close (fd);
+      return;			/* h->sh would otherwise be MAP_FAILED */
+    }
+  (void) close (fd);
+
+  h->sh = (void *) mmap_addr;
+  h->memfd = -1;
+  h->name = (u8 *) name;
+  h->buckets = u64_to_pointer (h->sh->buckets_as_u64);
+  h->nbuckets = h->sh->nbuckets;
+  h->log2_nbuckets = max_log2 (h->nbuckets);
+  h->alloc_lock = u64_to_pointer (h->sh->alloc_lock_as_u64);
+  h->freelists = u64_to_pointer (h->sh->freelists_as_u64);
+  h->fmt_fn = NULL;
+}
+#endif /* BIHASH_32_64_SVM */
+
 void BV (clib_bihash_set_kvp_format_fn) (BVT (clib_bihash) * h,
 					 format_function_t * fmt_fn)
 {
@@ -74,8 +197,13 @@
 void BV (clib_bihash_free) (BVT (clib_bihash) * h)
 {
   vec_free (h->working_copies);
+#if BIHASH_32_64_SVM == 0
   vec_free (h->freelists);
-  clib_mem_vm_free ((void *) (h->alloc_arena), h->alloc_arena_size);
+#else /* SVM: the freelist vector lives inside the arena freed below */
+  if (h->memfd > 0)		/* slave init stores -1: nothing to close */
+    (void) close (h->memfd);
+#endif
+  clib_mem_vm_free ((void *) (uword) (alloc_arena (h)), alloc_arena_size (h));
   memset (h, 0, sizeof (*h));
 }
 
@@ -86,14 +214,19 @@
   BVT (clib_bihash_value) * rv = 0;
 
   ASSERT (h->alloc_lock[0]);
+
+#if BIHASH_32_64_SVM
+  ASSERT (log2_pages < vec_len (h->freelists));
+#endif
+
   if (log2_pages >= vec_len (h->freelists) || h->freelists[log2_pages] == 0)
     {
       vec_validate_init_empty (h->freelists, log2_pages, 0);
       rv = BV (alloc_aligned) (h, (sizeof (*rv) * (1 << log2_pages)));
       goto initialize;
     }
-  rv = h->freelists[log2_pages];
-  h->freelists[log2_pages] = rv->next_free;
+  rv = (void *) (uword) h->freelists[log2_pages];
+  h->freelists[log2_pages] = rv->next_free_as_u64;
 
 initialize:
   ASSERT (rv);
@@ -117,8 +250,8 @@
   if (CLIB_DEBUG > 0)
     memset (v, 0xFE, sizeof (*v) * (1 << log2_pages));
 
-  v->next_free = h->freelists[log2_pages];
-  h->freelists[log2_pages] = v;
+  v->next_free_as_u64 = (u64) h->freelists[log2_pages];
+  h->freelists[log2_pages] = (u64) (uword) v;
 }
 
 static inline void
@@ -361,6 +494,7 @@
 	      CLIB_MEMORY_BARRIER ();	/* Make sure the value has settled */
 	      clib_memcpy (&(v->kvp[i]), &add_v->key, sizeof (add_v->key));
 	      b->refcnt++;
+	      ASSERT (b->refcnt > 0);
 	      BV (clib_bihash_unlock_bucket) (b);
 	      return (0);
 	    }
@@ -490,6 +624,7 @@
   tmp_b.offset = BV (clib_bihash_get_offset) (h, save_new_v);
   tmp_b.linear_search = mark_bucket_linear;
   tmp_b.refcnt = h->saved_bucket.refcnt + 1;
+  ASSERT (tmp_b.refcnt > 0);
   tmp_b.lock = 0;
   CLIB_MEMORY_BARRIER ();
   b->as_u64 = tmp_b.as_u64;
@@ -587,7 +722,7 @@
 
       if (verbose)
 	{
-	  s = format (s, "[%d]: heap offset %d, len %d, linear %d\n", i,
+	  s = format (s, "[%d]: heap offset %lld, len %d, linear %d\n", i,
 		      b->offset, (1 << b->log2_pages), b->linear_search);
 	}
 
@@ -633,24 +768,25 @@
       u32 nfree = 0;
       BVT (clib_bihash_value) * free_elt;
 
-      free_elt = h->freelists[i];
+      free_elt = (void *) (uword) h->freelists[i];
       while (free_elt)
 	{
 	  nfree++;
-	  free_elt = free_elt->next_free;
+	  free_elt = (void *) (uword) free_elt->next_free_as_u64;
 	}
 
-      s = format (s, "       [len %d] %u free elts\n", 1 << i, nfree);
+      if (nfree || verbose)
+	s = format (s, "       [len %d] %u free elts\n", 1 << i, nfree);
     }
 
   s = format (s, "    %lld linear search buckets\n", linear_buckets);
-  used_bytes = h->alloc_arena_next - h->alloc_arena;
+  used_bytes = alloc_arena_next (h) - alloc_arena (h);
   s = format (s,
 	      "    arena: base %llx, next %llx\n"
 	      "           used %lld b (%lld Mbytes) of %lld b (%lld Mbytes)\n",
-	      h->alloc_arena, h->alloc_arena_next,
+	      alloc_arena (h), alloc_arena_next (h),
 	      used_bytes, used_bytes >> 20,
-	      h->alloc_arena_size, h->alloc_arena_size >> 20);
+	      alloc_arena_size (h), alloc_arena_size (h) >> 20);
   return s;
 }