virtio: support virtio 1.1 packed ring in vhost
virtio 1.1 defines a number of new features. Packed ring is among the most
notable and important ones. It combines used, available, and descriptor rings
into one.
This patch provides experimental support for packed ring. To avoid
regression, when packed ring is configured for the interface, it is branched
to a separate RX and TX driver. Non packed ring should continue to perform
as it was before.
Packed ring is tested using qemu 4.2 and Ubuntu Focal Fossa (kernel 5.4.0-12)
on the guest VM, which supports packed ring.
To configure VPP with packed ring, just add the optional keyword "packed"
when creating the vhost interface. To bring up the guest VM with packed ring,
add "packed=on" in the qemu launch command.
To facilitate troubleshooting, this patch also adds a "verbose" option to the
"show vhost-user ... descriptors" CLI to display the indirect descriptors.
Known qemu reconnect issue -
If VPP is restarted, guest VMs also need to be restarted. The problem
is kernel virtio-net-pci keeps track of the previous available and used
indices. For virtio 1.0, these indices are in shared memory and qemu can
easily copy them to pass to the backend for reconnect. For virtio 1.1, these
indices are no longer in shared memory. Qemu needs a new mechanism to retrieve
them and it is not currently implemented. So when the protocol reconnects,
qemu does not have the correct available and used indices to pass to the
backend. As a result, after the reconnect, virtio-net-pci is reading the TX
ring from the wrong position in the ring, not the same position to which the
backend is writing. A similar problem also exists in the RX.
Type: feature
Signed-off-by: Steven Luong <sluong@cisco.com>
Change-Id: I5afc50b0bafab5a1de7a6dd10f399db3fafd144c
diff --git a/src/vnet/devices/virtio/vhost_user.c b/src/vnet/devices/virtio/vhost_user.c
index 7094a00..d24e516 100644
--- a/src/vnet/devices/virtio/vhost_user.c
+++ b/src/vnet/devices/virtio/vhost_user.c
@@ -466,6 +466,8 @@
if (vui->enable_gso)
msg.u64 |= FEATURE_VIRTIO_NET_F_HOST_GUEST_TSO_FEATURE_BITS;
+ if (vui->enable_packed)
+ msg.u64 |= (1ULL << FEAT_VIRTIO_F_RING_PACKED);
msg.size = sizeof (msg.u64);
vu_log_debug (vui, "if %d msg VHOST_USER_GET_FEATURES - reply "
@@ -655,7 +657,11 @@
vui->vrings[msg.state.index].used->idx;
/* tell driver that we don't want interrupts */
- vui->vrings[msg.state.index].used->flags = VRING_USED_F_NO_NOTIFY;
+ if (vhost_user_is_packed_ring_supported (vui))
+ vui->vrings[msg.state.index].used_event->flags =
+ VRING_EVENT_F_DISABLE;
+ else
+ vui->vrings[msg.state.index].used->flags = VRING_USED_F_NO_NOTIFY;
vlib_worker_thread_barrier_release (vm);
vhost_user_update_iface_state (vui);
break;
@@ -762,10 +768,47 @@
break;
case VHOST_USER_SET_VRING_BASE:
- vu_log_debug (vui, "if %d msg VHOST_USER_SET_VRING_BASE idx %d num %d",
+ vu_log_debug (vui,
+ "if %d msg VHOST_USER_SET_VRING_BASE idx %d num 0x%x",
vui->hw_if_index, msg.state.index, msg.state.num);
vlib_worker_thread_barrier_sync (vm);
vui->vrings[msg.state.index].last_avail_idx = msg.state.num;
+ if (vhost_user_is_packed_ring_supported (vui))
+ {
+ /*
+ * 0 1 2 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | last avail idx | | last used idx | |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * ^ ^
+ * | |
+ * avail wrap counter used wrap counter
+ */
+ /* last avail idx at bit 0-14. */
+ vui->vrings[msg.state.index].last_avail_idx =
+ msg.state.num & 0x7fff;
+ /* avail wrap counter at bit 15 */
+ vui->vrings[msg.state.index].avail_wrap_counter =
+ ! !(msg.state.num & (1 << 15));
+
+ /*
+ * Although last_used_idx is passed in the upper 16 bits in qemu
+ * implementation, in practice, last_avail_idx and last_used_idx are
+ * usually the same. As a result, DPDK does not bother to pass us
+ * last_used_idx. The spec is not clear on the encoding. I figured it
+ * out by reading the qemu code. So let's just read last_avail_idx
+ * and set last_used_idx equal to last_avail_idx.
+ */
+ vui->vrings[msg.state.index].last_used_idx =
+ vui->vrings[msg.state.index].last_avail_idx;
+ vui->vrings[msg.state.index].used_wrap_counter =
+ vui->vrings[msg.state.index].avail_wrap_counter;
+
+ if (vui->vrings[msg.state.index].avail_wrap_counter == 1)
+ vui->vrings[msg.state.index].avail_wrap_counter =
+ VIRTQ_DESC_F_AVAIL;
+ }
vlib_worker_thread_barrier_release (vm);
break;
@@ -784,6 +827,15 @@
* closing the vring also initializes the vring last_avail_idx
*/
msg.state.num = vui->vrings[msg.state.index].last_avail_idx;
+ if (vhost_user_is_packed_ring_supported (vui))
+ {
+ msg.state.num =
+ (vui->vrings[msg.state.index].last_avail_idx & 0x7fff) |
+ (! !vui->vrings[msg.state.index].avail_wrap_counter << 15);
+ msg.state.num |=
+ ((vui->vrings[msg.state.index].last_used_idx & 0x7fff) |
+ (! !vui->vrings[msg.state.index].used_wrap_counter << 15)) << 16;
+ }
msg.flags |= 4;
msg.size = sizeof (msg.state);
@@ -793,7 +845,8 @@
*/
vhost_user_vring_close (vui, msg.state.index);
vlib_worker_thread_barrier_release (vm);
- vu_log_debug (vui, "if %d msg VHOST_USER_GET_VRING_BASE idx %d num %d",
+ vu_log_debug (vui,
+ "if %d msg VHOST_USER_GET_VRING_BASE idx %d num 0x%x",
vui->hw_if_index, msg.state.index, msg.state.num);
n =
send (uf->file_descriptor, &msg, VHOST_USER_MSG_HDR_SZ + msg.size, 0);
@@ -1440,7 +1493,8 @@
vhost_user_intf_t * vui,
int server_sock_fd,
const char *sock_filename,
- u64 feature_mask, u32 * sw_if_index, u8 enable_gso)
+ u64 feature_mask, u32 * sw_if_index, u8 enable_gso,
+ u8 enable_packed)
{
vnet_sw_interface_t *sw;
int q;
@@ -1472,6 +1526,7 @@
vui->log_base_addr = 0;
vui->if_index = vui - vum->vhost_user_interfaces;
vui->enable_gso = enable_gso;
+ vui->enable_packed = enable_packed;
/*
* enable_gso takes precedence over configurable feature mask if there
* is a clash.
@@ -1519,7 +1574,7 @@
u32 * sw_if_index,
u64 feature_mask,
u8 renumber, u32 custom_dev_instance, u8 * hwaddr,
- u8 enable_gso)
+ u8 enable_gso, u8 enable_packed)
{
vhost_user_intf_t *vui = NULL;
u32 sw_if_idx = ~0;
@@ -1560,7 +1615,7 @@
vlib_worker_thread_barrier_release (vm);
vhost_user_vui_init (vnm, vui, server_sock_fd, sock_filename,
- feature_mask, &sw_if_idx, enable_gso);
+ feature_mask, &sw_if_idx, enable_gso, enable_packed);
vnet_sw_interface_set_mtu (vnm, vui->sw_if_index, 9000);
vhost_user_rx_thread_placement (vui, 1);
@@ -1582,7 +1637,7 @@
u8 is_server,
u32 sw_if_index,
u64 feature_mask, u8 renumber, u32 custom_dev_instance,
- u8 enable_gso)
+ u8 enable_gso, u8 enable_packed)
{
vhost_user_main_t *vum = &vhost_user_main;
vhost_user_intf_t *vui = NULL;
@@ -1619,7 +1674,8 @@
vhost_user_term_if (vui);
vhost_user_vui_init (vnm, vui, server_sock_fd,
- sock_filename, feature_mask, &sw_if_idx, enable_gso);
+ sock_filename, feature_mask, &sw_if_idx, enable_gso,
+ enable_packed);
if (renumber)
vnet_interface_name_renumber (sw_if_idx, custom_dev_instance);
@@ -1645,7 +1701,7 @@
u8 hwaddr[6];
u8 *hw = NULL;
clib_error_t *error = NULL;
- u8 enable_gso = 0;
+ u8 enable_gso = 0, enable_packed = 0;
/* Get a line of input. */
if (!unformat_user (input, unformat_line_input, line_input))
@@ -1653,6 +1709,8 @@
/* GSO feature is disable by default */
feature_mask &= ~FEATURE_VIRTIO_NET_F_HOST_GUEST_TSO_FEATURE_BITS;
+ /* packed-ring feature is disable by default */
+ feature_mask &= ~(1ULL << FEAT_VIRTIO_F_RING_PACKED);
while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
{
if (unformat (line_input, "socket %s", &sock_filename))
@@ -1661,6 +1719,8 @@
is_server = 1;
else if (unformat (line_input, "gso"))
enable_gso = 1;
+ else if (unformat (line_input, "packed"))
+ enable_packed = 1;
else if (unformat (line_input, "feature-mask 0x%llx", &feature_mask))
;
else
@@ -1685,7 +1745,7 @@
if ((rv = vhost_user_create_if (vnm, vm, (char *) sock_filename,
is_server, &sw_if_index, feature_mask,
renumber, custom_dev_instance, hw,
- enable_gso)))
+ enable_gso, enable_packed)))
{
error = clib_error_return (0, "vhost_user_create_if returned %d", rv);
goto done;
@@ -1799,6 +1859,186 @@
return rv;
}
+static u8 *
+format_vhost_user_desc (u8 * s, va_list * args)
+{
+ char *fmt = va_arg (*args, char *);
+ vhost_user_intf_t *vui = va_arg (*args, vhost_user_intf_t *);
+ vring_desc_t *desc_table = va_arg (*args, vring_desc_t *);
+ int idx = va_arg (*args, int);
+ u32 *mem_hint = va_arg (*args, u32 *);
+
+ s = format (s, fmt, idx, desc_table[idx].addr, desc_table[idx].len,
+ desc_table[idx].flags, desc_table[idx].next,
+ pointer_to_uword (map_guest_mem (vui, desc_table[idx].addr,
+ mem_hint)));
+ return s;
+}
+
+static u8 *
+format_vhost_user_vring (u8 * s, va_list * args)
+{
+ char *fmt = va_arg (*args, char *);
+ vhost_user_intf_t *vui = va_arg (*args, vhost_user_intf_t *);
+ int q = va_arg (*args, int);
+
+ s = format (s, fmt, vui->vrings[q].avail->flags, vui->vrings[q].avail->idx,
+ vui->vrings[q].used->flags, vui->vrings[q].used->idx);
+ return s;
+}
+
+static void
+vhost_user_show_fds (vlib_main_t * vm, vhost_user_intf_t * vui, int q)
+{
+ int kickfd = UNIX_GET_FD (vui->vrings[q].kickfd_idx);
+ int callfd = UNIX_GET_FD (vui->vrings[q].callfd_idx);
+
+ vlib_cli_output (vm, " kickfd %d callfd %d errfd %d\n", kickfd, callfd,
+ vui->vrings[q].errfd);
+}
+
+static void
+vhost_user_show_desc (vlib_main_t * vm, vhost_user_intf_t * vui, int q,
+ int show_descr, int show_verbose)
+{
+ int j;
+ u32 mem_hint = 0;
+ u32 idx;
+ u32 n_entries;
+ vring_desc_t *desc_table;
+
+ if (vui->vrings[q].avail && vui->vrings[q].used)
+ vlib_cli_output (vm, "%U", format_vhost_user_vring,
+ " avail.flags %x avail.idx %d used.flags %x used.idx %d\n",
+ vui, q);
+
+ vhost_user_show_fds (vm, vui, q);
+
+ if (show_descr)
+ {
+ vlib_cli_output (vm, "\n descriptor table:\n");
+ vlib_cli_output (vm,
+ " slot addr len flags next "
+ "user_addr\n");
+ vlib_cli_output (vm,
+ " ===== ================== ===== ====== ===== "
+ "==================\n");
+ for (j = 0; j < vui->vrings[q].qsz_mask + 1; j++)
+ {
+ desc_table = vui->vrings[q].desc;
+ vlib_cli_output (vm, "%U", format_vhost_user_desc,
+ " %-5d 0x%016lx %-5d 0x%04x %-5d 0x%016lx\n", vui,
+ desc_table, j, &mem_hint);
+ if (show_verbose && (desc_table[j].flags & VIRTQ_DESC_F_INDIRECT))
+ {
+ n_entries = desc_table[j].len / sizeof (vring_desc_t);
+ desc_table = map_guest_mem (vui, desc_table[j].addr, &mem_hint);
+ if (desc_table)
+ {
+ for (idx = 0; idx < clib_min (20, n_entries); idx++)
+ {
+ vlib_cli_output
+ (vm, "%U", format_vhost_user_desc,
+ "> %-4u 0x%016lx %-5u 0x%04x %-5u 0x%016lx\n", vui,
+ desc_table, idx, &mem_hint);
+ }
+ if (n_entries >= 20)
+ vlib_cli_output (vm, "Skip displaying entries 20...%u\n",
+ n_entries);
+ }
+ }
+ }
+ }
+}
+
+static u8 *
+format_vhost_user_packed_desc (u8 * s, va_list * args)
+{
+ char *fmt = va_arg (*args, char *);
+ vhost_user_intf_t *vui = va_arg (*args, vhost_user_intf_t *);
+ vring_packed_desc_t *desc_table = va_arg (*args, vring_packed_desc_t *);
+ int idx = va_arg (*args, int);
+ u32 *mem_hint = va_arg (*args, u32 *);
+
+ s = format (s, fmt, idx, desc_table[idx].addr, desc_table[idx].len,
+ desc_table[idx].flags, desc_table[idx].id,
+ pointer_to_uword (map_guest_mem (vui, desc_table[idx].addr,
+ mem_hint)));
+ return s;
+}
+
+static u8 *
+format_vhost_user_vring_packed (u8 * s, va_list * args)
+{
+ char *fmt = va_arg (*args, char *);
+ vhost_user_intf_t *vui = va_arg (*args, vhost_user_intf_t *);
+ int q = va_arg (*args, int);
+
+ s = format (s, fmt, vui->vrings[q].avail_event->flags,
+ vui->vrings[q].avail_event->off_wrap,
+ vui->vrings[q].used_event->flags,
+ vui->vrings[q].used_event->off_wrap,
+ vui->vrings[q].avail_wrap_counter,
+ vui->vrings[q].used_wrap_counter);
+ return s;
+}
+
+static void
+vhost_user_show_desc_packed (vlib_main_t * vm, vhost_user_intf_t * vui, int q,
+ int show_descr, int show_verbose)
+{
+ int j;
+ u32 mem_hint = 0;
+ u32 idx;
+ u32 n_entries;
+ vring_packed_desc_t *desc_table;
+
+ if (vui->vrings[q].avail_event && vui->vrings[q].used_event)
+ vlib_cli_output (vm, "%U", format_vhost_user_vring_packed,
+ " avail_event.flags %x avail_event.off_wrap %u "
+ "used_event.flags %x used_event.off_wrap %u\n"
+ " avail wrap counter %u, used wrap counter %u\n",
+ vui, q);
+
+ vhost_user_show_fds (vm, vui, q);
+
+ if (show_descr)
+ {
+ vlib_cli_output (vm, "\n descriptor table:\n");
+ vlib_cli_output (vm,
+ " slot addr len flags id "
+ "user_addr\n");
+ vlib_cli_output (vm,
+ " ===== ================== ===== ====== ===== "
+ "==================\n");
+ for (j = 0; j < vui->vrings[q].qsz_mask + 1; j++)
+ {
+ desc_table = vui->vrings[q].packed_desc;
+ vlib_cli_output (vm, "%U", format_vhost_user_packed_desc,
+ " %-5u 0x%016lx %-5u 0x%04x %-5u 0x%016lx\n", vui,
+ desc_table, j, &mem_hint);
+ if (show_verbose && (desc_table[j].flags & VIRTQ_DESC_F_INDIRECT))
+ {
+ n_entries = desc_table[j].len >> 4;
+ desc_table = map_guest_mem (vui, desc_table[j].addr, &mem_hint);
+ if (desc_table)
+ {
+ for (idx = 0; idx < clib_min (20, n_entries); idx++)
+ {
+ vlib_cli_output
+ (vm, "%U", format_vhost_user_packed_desc,
+ "> %-4u 0x%016lx %-5u 0x%04x %-5u 0x%016lx\n", vui,
+ desc_table, idx, &mem_hint);
+ }
+ if (n_entries >= 20)
+ vlib_cli_output (vm, "Skip displaying entries 20...%u\n",
+ n_entries);
+ }
+ }
+ }
+ }
+}
+
clib_error_t *
show_vhost_user_command_fn (vlib_main_t * vm,
unformat_input_t * input,
@@ -1814,6 +2054,7 @@
u32 ci;
int i, j, q;
int show_descr = 0;
+ int show_verbose = 0;
struct feat_struct
{
u8 bit;
@@ -1855,6 +2096,8 @@
}
else if (unformat (input, "descriptors") || unformat (input, "desc"))
show_descr = 1;
+ else if (unformat (input, "verbose"))
+ show_verbose = 1;
else
{
error = clib_error_return (0, "unknown input `%U'",
@@ -1884,6 +2127,8 @@
hw_if_indices[i]);
if (vui->enable_gso)
vlib_cli_output (vm, " GSO enable");
+ if (vui->enable_packed)
+ vlib_cli_output (vm, " Packed ring enable");
vlib_cli_output (vm, "virtio_net_hdr_sz %d\n"
" features mask (0x%llx): \n"
@@ -1985,41 +2230,11 @@
vui->vrings[q].last_avail_idx,
vui->vrings[q].last_used_idx);
- if (vui->vrings[q].avail && vui->vrings[q].used)
- vlib_cli_output (vm,
- " avail.flags %x avail.idx %d used.flags %x used.idx %d\n",
- vui->vrings[q].avail->flags,
- vui->vrings[q].avail->idx,
- vui->vrings[q].used->flags,
- vui->vrings[q].used->idx);
-
- int kickfd = UNIX_GET_FD (vui->vrings[q].kickfd_idx);
- int callfd = UNIX_GET_FD (vui->vrings[q].callfd_idx);
- vlib_cli_output (vm, " kickfd %d callfd %d errfd %d\n",
- kickfd, callfd, vui->vrings[q].errfd);
-
- if (show_descr)
- {
- vlib_cli_output (vm, "\n descriptor table:\n");
- vlib_cli_output (vm,
- " id addr len flags next user_addr\n");
- vlib_cli_output (vm,
- " ===== ================== ===== ====== ===== ==================\n");
- for (j = 0; j < vui->vrings[q].qsz_mask + 1; j++)
- {
- u32 mem_hint = 0;
- vlib_cli_output (vm,
- " %-5d 0x%016lx %-5d 0x%04x %-5d 0x%016lx\n",
- j, vui->vrings[q].desc[j].addr,
- vui->vrings[q].desc[j].len,
- vui->vrings[q].desc[j].flags,
- vui->vrings[q].desc[j].next,
- pointer_to_uword (map_guest_mem
- (vui,
- vui->vrings[q].desc[j].
- addr, &mem_hint)));
- }
- }
+ if (vhost_user_is_packed_ring_supported (vui))
+ vhost_user_show_desc_packed (vm, vui, q, show_descr,
+ show_verbose);
+ else
+ vhost_user_show_desc (vm, vui, q, show_descr, show_verbose);
}
vlib_cli_output (vm, "\n");
}
@@ -2090,7 +2305,8 @@
VLIB_CLI_COMMAND (vhost_user_connect_command, static) = {
.path = "create vhost-user",
.short_help = "create vhost-user socket <socket-filename> [server] "
- "[feature-mask <hex>] [hwaddr <mac-addr>] [renumber <dev_instance>] [gso]",
+ "[feature-mask <hex>] [hwaddr <mac-addr>] [renumber <dev_instance>] [gso] "
+ "[packed]",
.function = vhost_user_connect_command_fn,
.is_mp_safe = 1,
};
@@ -2251,7 +2467,8 @@
/* *INDENT-OFF* */
VLIB_CLI_COMMAND (show_vhost_user_command, static) = {
.path = "show vhost-user",
- .short_help = "show vhost-user [<interface> [<interface> [..]]] [descriptors]",
+ .short_help = "show vhost-user [<interface> [<interface> [..]]] "
+ "[[descriptors] [verbose]]",
.function = show_vhost_user_command_fn,
};
/* *INDENT-ON* */