Add live migration support to (non-dpdk) vhost-user driver

This patch adds live migration support to vhost interfaces, by supporting the VHOST_F_LOG_ALL feature. When qemu starts a migration, it will negotiate this feature, and provide a fd for a "dirty log" shared mem space. This log is a bitmap representing pages in the device memory.
Whenever we touch memory pointed by a "desc" vring, or modify a "used" vring, we log the corresponding page in the bitmap. This allows qemu to send the dirty page to the destination host.
See https://github.com/qemu/qemu/blob/master/docs/specs/vhost-user.txt, ยง "Live migration" for more details.
In addition to this, this code provides support for the VHOST_USER_F_PROTOCOL_FEATURES feature, and to VHOST_USER_{GET,SET}_PROTOCOL_FEATURES and VHOST_USER_SET_VRING_ENABLE messages, required for live migration.

Change-Id: I7577efce8bd67653218f4291af1d651de451e552
Signed-off-by: Yoann Desmouceaux <ydesmouc@cisco.com>
diff --git a/vnet/vnet/devices/virtio/vhost-user.c b/vnet/vnet/devices/virtio/vhost-user.c
index df09db8..945f03a 100644
--- a/vnet/vnet/devices/virtio/vhost-user.c
+++ b/vnet/vnet/devices/virtio/vhost-user.c
@@ -230,12 +230,38 @@
     vui->vrings[q].desc = NULL;
     vui->vrings[q].avail = NULL;
     vui->vrings[q].used = NULL;
+    vui->vrings[q].log_guest_addr = 0;
   }
 
   unmap_all_mem_regions(vui);
   DBG_SOCK("interface ifindex %d disconnected", vui->sw_if_index);
 }
 
+#define VHOST_LOG_PAGE 0x1000
+always_inline void vhost_user_log_dirty_pages(vhost_user_intf_t * vui,
+                                                u64 addr, u64 len)
+{
+  if (PREDICT_TRUE(vui->log_base_addr == 0
+                   || !(vui->features & (1 << FEAT_VHOST_F_LOG_ALL)))) {
+    return;
+  }
+  if (PREDICT_FALSE((addr + len - 1) / VHOST_LOG_PAGE / 8 >= vui->log_size)) {
+    DBG_SOCK("vhost_user_log_dirty_pages(): out of range\n");
+    return;
+  }
+
+  CLIB_MEMORY_BARRIER();
+  u64 page = addr / VHOST_LOG_PAGE;
+  while (page * VHOST_LOG_PAGE < addr + len) {
+    ((u8*)vui->log_base_addr)[page / 8] |= 1 << page % 8;
+    page++;
+  }
+}
+
+#define vhost_user_log_dirty_ring(vui, vq, member) \
+  vhost_user_log_dirty_pages(vui, vq->log_guest_addr + offsetof(vring_used_t, member), \
+                             sizeof(vq->used->member))
+
 static clib_error_t * vhost_user_socket_read (unix_file_t * uf)
 {
   int n, i;
@@ -313,7 +339,10 @@
 
       msg.flags |= 4;
       msg.u64 = (1 << FEAT_VIRTIO_NET_F_MRG_RXBUF) |
-                (1 << FEAT_VIRTIO_F_ANY_LAYOUT);
+                (1 << FEAT_VIRTIO_F_ANY_LAYOUT) |
+                (1 << FEAT_VHOST_F_LOG_ALL) |
+                (1 << FEAT_VIRTIO_NET_F_GUEST_ANNOUNCE) |
+                (1 << FEAT_VHOST_USER_F_PROTOCOL_FEATURES);
       msg.u64 &= vui->feature_mask;
 
       msg.size = sizeof(msg.u64);
@@ -324,6 +353,7 @@
         vui->hw_if_index, msg.u64);
 
       vui->features = msg.u64;
+
       if (vui->features & (1 << FEAT_VIRTIO_NET_F_MRG_RXBUF))
         vui->virtio_net_hdr_sz = 12;
       else
@@ -339,6 +369,7 @@
         vui->vrings[q].desc = 0;
         vui->vrings[q].avail = 0;
         vui->vrings[q].used = 0;
+        vui->vrings[q].log_guest_addr = 0;
       }
 
       DBG_SOCK("interface %d disconnected", vui->sw_if_index);
@@ -419,6 +450,15 @@
         goto close_socket;
       }
 
+      vui->vrings[msg.state.index].log_guest_addr = msg.addr.log_guest_addr;
+
+      /* Spec says: If VHOST_USER_F_PROTOCOL_FEATURES has not been negotiated,
+       the ring is initialized in an enabled state. */
+
+      if (!(vui->features & (1 << FEAT_VHOST_USER_F_PROTOCOL_FEATURES))) {
+        vui->vrings[msg.state.index].enabled = 1;
+      }
+
       vui->vrings[msg.state.index].last_used_idx =
           vui->vrings[msg.state.index].used->idx;
 
@@ -509,7 +549,10 @@
       DBG_SOCK("if %d msg VHOST_USER_GET_VRING_BASE idx %d num %d",
         vui->hw_if_index, msg.state.index, msg.state.num);
 
-      msg.state.num = vui->vrings[msg.state.index].last_used_idx;
+      /* Spec says: Client must [...] stop ring upon receiving VHOST_USER_GET_VRING_BASE. */
+      vui->vrings[msg.state.index].enabled = 0;
+
+      msg.state.num = vui->vrings[msg.state.index].last_avail_idx;
       msg.flags |= 4;
       msg.size = sizeof(msg.state);
       break;
@@ -521,10 +564,45 @@
       break;
 
     case VHOST_USER_SET_LOG_BASE:
+    {
       DBG_SOCK("if %d msg VHOST_USER_SET_LOG_BASE",
         vui->hw_if_index);
 
+      if (msg.size != sizeof(msg.log)) {
+        DBG_SOCK("invalid msg size for VHOST_USER_SET_LOG_BASE: %d instead of %d",
+                 msg.size, sizeof(msg.log));
+        goto close_socket;
+      }
+
+      if (!(vui->protocol_features & (1 << VHOST_USER_PROTOCOL_F_LOG_SHMFD))) {
+        DBG_SOCK("VHOST_USER_PROTOCOL_F_LOG_SHMFD not set but VHOST_USER_SET_LOG_BASE received");
+        goto close_socket;
+      }
+
+      fd = fds[0];
+      /* align size to 2M page */
+      long page_sz = get_huge_page_size(fd);
+      ssize_t map_sz = (msg.log.size + msg.log.offset + page_sz) & ~(page_sz - 1);
+
+      vui->log_base_addr = mmap(0, map_sz, PROT_READ | PROT_WRITE,
+                                MAP_SHARED, fd, 0);
+
+      DBG_SOCK("map log region addr 0 len 0x%lx off 0x%lx fd %d mapped 0x%lx",
+               map_sz, msg.log.offset, fd, vui->log_base_addr);
+
+      if (vui->log_base_addr == MAP_FAILED) {
+        clib_warning("failed to map memory. errno is %d", errno);
+        goto close_socket;
+      }
+
+      vui->log_base_addr += msg.log.offset;
+      vui->log_size = msg.log.size;
+
+      msg.flags |= 4;
+      msg.size = sizeof(msg.u64);
+
       break;
+    }
 
     case VHOST_USER_SET_LOG_FD:
       DBG_SOCK("if %d msg VHOST_USER_SET_LOG_FD",
@@ -532,6 +610,28 @@
 
       break;
 
+    case VHOST_USER_GET_PROTOCOL_FEATURES:
+      DBG_SOCK("if %d msg VHOST_USER_GET_PROTOCOL_FEATURES", vui->hw_if_index);
+
+      msg.flags |= 4;
+      msg.u64 = (1 << VHOST_USER_PROTOCOL_F_LOG_SHMFD);
+      msg.size = sizeof(msg.u64);
+      break;
+
+    case VHOST_USER_SET_PROTOCOL_FEATURES:
+      DBG_SOCK("if %d msg VHOST_USER_SET_PROTOCOL_FEATURES features 0x%lx",
+               vui->hw_if_index, msg.u64);
+
+      vui->protocol_features = msg.u64;
+
+      break;
+
+    case VHOST_USER_SET_VRING_ENABLE:
+      DBG_SOCK("if %d VHOST_USER_SET_VRING_ENABLE, enable: %d",
+               vui->hw_if_index, msg.state.num);
+      vui->vrings[msg.state.index].enabled = msg.state.num;
+      break;
+
     default:
       DBG_SOCK("unknown vhost-user message %d received. closing socket",
         msg.request);
@@ -750,6 +850,7 @@
     vq->int_deadline = vlib_time_now(vm) + vum->coalesce_time;
 }
 
+
 static u32 vhost_user_if_input ( vlib_main_t * vm,
                                vhost_user_main_t * vum, 
                                vhost_user_intf_t * vui,
@@ -770,7 +871,7 @@
   vec_reset_length (vui->d_trace_buffers);
 
   /* no descriptor ptr - bail out */
-  if (PREDICT_FALSE(!txvq->desc || !txvq->avail))
+  if (PREDICT_FALSE(!txvq->desc || !txvq->avail || !txvq->enabled))
     return 0;
 
   /* do we have pending intterupts ? */
@@ -799,6 +900,7 @@
     txvq->last_avail_idx = txvq->last_used_idx = txvq->avail->idx;
     CLIB_MEMORY_BARRIER();
     txvq->used->idx = txvq->last_used_idx;
+    vhost_user_log_dirty_ring(vui, txvq, idx);
     vhost_user_send_call(vm, txvq);
     return 0;
   }
@@ -849,6 +951,7 @@
       txvq->last_avail_idx++;
       txvq->used->ring[txvq->last_used_idx & qsz_mask].id = desc_chain_head;
       txvq->used->ring[txvq->last_used_idx & qsz_mask].len = 0;
+      vhost_user_log_dirty_ring(vui, txvq, ring[txvq->last_used_idx & qsz_mask]);
       txvq->last_used_idx++;
       flush--;
     }
@@ -914,6 +1017,7 @@
       txvq->last_avail_idx++;
       txvq->used->ring[txvq->last_used_idx & qsz_mask].id = desc_chain_head;
       txvq->used->ring[txvq->last_used_idx & qsz_mask].len = 0;
+      vhost_user_log_dirty_ring(vui, txvq, ring[txvq->last_used_idx & qsz_mask]);
       txvq->last_used_idx++;
 
       if(PREDICT_FALSE(b_head->current_length < 14 &&
@@ -957,6 +1061,7 @@
   /* give buffers back to driver */
   CLIB_MEMORY_BARRIER();
   txvq->used->idx = txvq->last_used_idx;
+  vhost_user_log_dirty_ring(vui, txvq, idx);
 
   if (PREDICT_FALSE (vec_len (vui->d_trace_buffers) > 0))
   {
@@ -1052,7 +1157,7 @@
   if (PREDICT_FALSE(!vui->is_up))
      goto done2;
 
-  if (PREDICT_FALSE(!rxvq->desc || !rxvq->avail || vui->sock_errno != 0)) {
+  if (PREDICT_FALSE(!rxvq->desc || !rxvq->avail || vui->sock_errno != 0 || !rxvq->enabled)) {
      error = VHOST_USER_TX_FUNC_ERROR_NOT_READY;
      goto done2;
   }
@@ -1111,6 +1216,8 @@
       hdr->hdr.flags = 0;
       hdr->hdr.gso_type = 0;
 
+      vhost_user_log_dirty_pages(vui, rxvq->desc[desc_current].addr, vui->virtio_net_hdr_sz);
+
       if (vui->virtio_net_hdr_sz == 12)
         hdr->num_buffers = 1;
 
@@ -1149,6 +1256,7 @@
             //Move from available to used buffer
             rxvq->used->ring[used_index & qsz_mask].id = desc_chain_head;
             rxvq->used->ring[used_index & qsz_mask].len = desc_len;
+            vhost_user_log_dirty_ring(vui, rxvq, ring[used_index & qsz_mask]);
             rxvq->last_avail_idx++;
             used_index++;
             hdr->num_buffers++;
@@ -1182,6 +1290,7 @@
         u16 bytes_to_copy = bytes_left > (rxvq->desc[desc_current].len - offset) ? (rxvq->desc[desc_current].len - offset) : bytes_left;
         rte_memcpy(buffer_addr, vlib_buffer_get_current (current_b0) + current_b0->current_length - bytes_left, bytes_to_copy);
 
+        vhost_user_log_dirty_pages(vui, rxvq->desc[desc_current].addr + offset, bytes_to_copy);
         bytes_left -= bytes_to_copy;
         offset += bytes_to_copy;
         buffer_addr += bytes_to_copy;
@@ -1191,6 +1300,8 @@
       //Move from available to used ring
       rxvq->used->ring[used_index & qsz_mask].id = desc_chain_head;
       rxvq->used->ring[used_index & qsz_mask].len = desc_len;
+      vhost_user_log_dirty_ring(vui, rxvq, ring[used_index & qsz_mask]);
+
       rxvq->last_avail_idx++;
       used_index++;
   }
@@ -1198,6 +1309,7 @@
 done:
   CLIB_MEMORY_BARRIER();
   rxvq->used->idx = used_index;
+  vhost_user_log_dirty_ring(vui, rxvq, idx);
 
   /* interrupt (call) handling */
   if((rxvq->callfd > 0) && !(rxvq->avail->flags & 1)) {
@@ -1473,6 +1585,7 @@
   vnet_sw_interface_t * sw;
   sw = vnet_get_hw_sw_interface (vnm, vui->hw_if_index);
   vlib_thread_main_t * tm = vlib_get_thread_main();
+  int q;
 
   vui->unix_fd = sockfd;
   vui->sw_if_index = sw->sw_if_index;
@@ -1484,6 +1597,11 @@
   vui->feature_mask = feature_mask;
   vui->active = 1;
   vui->unix_file_index = ~0;
+  vui->log_base_addr = 0;
+
+  for (q = 0; q < 2; q++) {
+    vui->vrings[q].enabled = 0;
+  }
 
   vnet_hw_interface_set_flags (vnm, vui->hw_if_index,  0);
 
diff --git a/vnet/vnet/devices/virtio/vhost-user.h b/vnet/vnet/devices/virtio/vhost-user.h
index bf3f6d5..83dbf3e 100644
--- a/vnet/vnet/devices/virtio/vhost-user.h
+++ b/vnet/vnet/devices/virtio/vhost-user.h
@@ -26,9 +26,11 @@
 #define VIRTQ_DESC_F_NEXT               1
 #define VHOST_USER_REPLY_MASK       (0x1 << 2)
 
+#define VHOST_USER_PROTOCOL_F_MQ   0
+#define VHOST_USER_PROTOCOL_F_LOG_SHMFD	1
+
 #if RTE_VERSION >= RTE_VERSION_NUM(2, 2, 0, 0)
 #define VHOST_USER_F_PROTOCOL_FEATURES  30
-#define VHOST_USER_PROTOCOL_F_MQ   0
 #define VHOST_USER_PROTOCOL_FEATURES   (1ULL << VHOST_USER_PROTOCOL_F_MQ)
 
 /* If multiqueue is provided by host, then we suppport it. */
@@ -40,7 +42,11 @@
 
 #define foreach_virtio_net_feature      \
  _ (VIRTIO_NET_F_MRG_RXBUF, 15)         \
- _ (VIRTIO_F_ANY_LAYOUT, 27)
+ _ (VIRTIO_F_ANY_LAYOUT, 27)            \
+ _ (VHOST_F_LOG_ALL, 26)                \
+ _ (VIRTIO_NET_F_GUEST_ANNOUNCE, 21)    \
+ _ (VHOST_USER_F_PROTOCOL_FEATURES, 30)
+
 
 typedef enum {
 #define _(f,n) FEAT_##f = (n),
@@ -80,6 +86,11 @@
   u64 desc_user_addr, used_user_addr, avail_user_addr, log_guest_addr;
 } vhost_vring_addr_t;
 
+typedef struct vhost_user_log {
+  u64 size;
+  u64 offset;
+} vhost_user_log_t;
+
 typedef enum vhost_user_req {
   VHOST_USER_NONE = 0,
   VHOST_USER_GET_FEATURES = 1,
@@ -96,12 +107,12 @@
   VHOST_USER_SET_VRING_KICK = 12,
   VHOST_USER_SET_VRING_CALL = 13,
   VHOST_USER_SET_VRING_ERR = 14,
-#if RTE_VERSION >= RTE_VERSION_NUM(2, 2, 0, 0)
   VHOST_USER_GET_PROTOCOL_FEATURES = 15,
   VHOST_USER_SET_PROTOCOL_FEATURES = 16,
+#if RTE_VERSION >= RTE_VERSION_NUM(2, 2, 0, 0)
   VHOST_USER_GET_QUEUE_NUM = 17,
-  VHOST_USER_SET_VRING_ENABLE = 18,
 #endif
+  VHOST_USER_SET_VRING_ENABLE = 18,
   VHOST_USER_MAX
 } vhost_user_req_t;
 
@@ -151,6 +162,7 @@
         vhost_vring_state_t state;
         vhost_vring_addr_t addr;
         vhost_user_memory_t memory;
+        vhost_user_log_t log;
     };
 } __attribute ((packed)) vhost_user_msg_t;
 
@@ -161,9 +173,11 @@
   vring_desc_t *desc;
   vring_avail_t *avail;
   vring_used_t *used;
+  u64 log_guest_addr;
   int callfd;
   int kickfd;
   int errfd;
+  u32 enabled;
   u32 callfd_idx;
   u32 n_since_last_int;
   f64 int_deadline;
@@ -186,6 +200,7 @@
   u32 nregions;
   u64 features;
   u64 feature_mask;
+  u64 protocol_features;
   u32 num_vrings;
   vhost_user_memory_region_t regions[VHOST_MEMORY_MAX_NREGIONS];
   void * region_mmap_addr[VHOST_MEMORY_MAX_NREGIONS];
@@ -194,6 +209,9 @@
   int virtio_net_hdr_sz;
   int is_any_layout;
   u32 * d_trace_buffers;
+
+  void * log_base_addr;
+  u64 log_size;
 } vhost_user_intf_t;
 
 typedef struct {