tun/tap: Bad packets sent to kernel via tun/tap interface
Under heavy traffic, VPP was observed to occasionally send packets with
the wrong source and destination to the tun/tap interface, so traffic
appears to be sent in the wrong direction. This problem is only seen
when worker threads are configured.
When worker threads are used, TX and RX may run on different cores.
Yet both the TX and RX threads share the same global variable, namely
iovecs, without any mutex or memory barrier protection. This creates a
race condition when VPP is under heavy traffic, e.g. 1000 pps.
We could add a mutex or memory barrier to protect access to the shared
iovecs, but it is a lot cheaper to simply decouple them so that TX and
RX each have their own iovecs.
Change-Id: I86a5a19bd8de54d54f32e1f0845bae6a81bbf686
Signed-off-by: Steven <sluong@cisco.com>
diff --git a/src/vnet/unix/tapcli.c b/src/vnet/unix/tapcli.c
index 13154b3..ce38609 100644
--- a/src/vnet/unix/tapcli.c
+++ b/src/vnet/unix/tapcli.c
@@ -99,8 +99,11 @@
* @brief TAPCLI main state struct
*/
typedef struct {
- /** Vector of iovecs for readv/writev calls. */
- struct iovec * iovecs;
+ /** Vector of iovecs for readv calls. */
+ struct iovec * rd_iovecs;
+
+ /** Vector of iovecs for writev calls. */
+ struct iovec * wr_iovecs;
/** Vector of VLIB rx buffers to use. We allocate them in blocks
of VLIB_FRAME_SIZE (256). */
@@ -199,11 +202,11 @@
ti = vec_elt_at_index (tm->tapcli_interfaces, p[0]);
/* Re-set iovecs if present. */
- if (tm->iovecs)
- _vec_len (tm->iovecs) = 0;
+ if (tm->wr_iovecs)
+ _vec_len (tm->wr_iovecs) = 0;
/* VLIB buffer chain -> Unix iovec(s). */
- vec_add2 (tm->iovecs, iov, 1);
+ vec_add2 (tm->wr_iovecs, iov, 1);
iov->iov_base = b->data + b->current_data;
iov->iov_len = l = b->current_length;
@@ -212,7 +215,7 @@
do {
b = vlib_get_buffer (vm, b->next_buffer);
- vec_add2 (tm->iovecs, iov, 1);
+ vec_add2 (tm->wr_iovecs, iov, 1);
iov->iov_base = b->data + b->current_data;
iov->iov_len = b->current_length;
@@ -220,7 +223,7 @@
} while (b->flags & VLIB_BUFFER_NEXT_PRESENT);
}
- if (writev (ti->unix_fd, tm->iovecs, vec_len (tm->iovecs)) < l)
+ if (writev (ti->unix_fd, tm->wr_iovecs, vec_len (tm->wr_iovecs)) < l)
clib_unix_warning ("writev");
}
@@ -292,14 +295,14 @@
/* Allocate RX buffers from end of rx_buffers.
Turn them into iovecs to pass to readv. */
- vec_validate (tm->iovecs, tm->mtu_buffers - 1);
+ vec_validate (tm->rd_iovecs, tm->mtu_buffers - 1);
for (j = 0; j < tm->mtu_buffers; j++) {
b = vlib_get_buffer (vm, tm->rx_buffers[i_rx - j]);
- tm->iovecs[j].iov_base = b->data;
- tm->iovecs[j].iov_len = buffer_size;
+ tm->rd_iovecs[j].iov_base = b->data;
+ tm->rd_iovecs[j].iov_len = buffer_size;
}
- n_bytes_left = readv (ti->unix_fd, tm->iovecs, tm->mtu_buffers);
+ n_bytes_left = readv (ti->unix_fd, tm->rd_iovecs, tm->mtu_buffers);
n_bytes_in_packet = n_bytes_left;
if (n_bytes_left <= 0) {
if (errno != EAGAIN) {
diff --git a/src/vnet/unix/tuntap.c b/src/vnet/unix/tuntap.c
index 9616feb..dc5c2a8 100644
--- a/src/vnet/unix/tuntap.c
+++ b/src/vnet/unix/tuntap.c
@@ -71,8 +71,11 @@
* @brief TUNTAP node main state
*/
typedef struct {
- /** Vector of iovecs for readv/writev calls. */
- struct iovec * iovecs;
+ /** Vector of iovecs for readv calls. */
+ struct iovec * rd_iovecs;
+
+ /** Vector of iovecs for writev calls. */
+ struct iovec * wr_iovecs;
/** Vector of VLIB rx buffers to use. We allocate them in blocks
of VLIB_FRAME_SIZE (256). */
@@ -160,11 +163,11 @@
}
/* Re-set iovecs if present. */
- if (tm->iovecs)
- _vec_len (tm->iovecs) = 0;
+ if (tm->wr_iovecs)
+ _vec_len (tm->wr_iovecs) = 0;
/** VLIB buffer chain -> Unix iovec(s). */
- vec_add2 (tm->iovecs, iov, 1);
+ vec_add2 (tm->wr_iovecs, iov, 1);
iov->iov_base = b->data + b->current_data;
iov->iov_len = l = b->current_length;
@@ -173,7 +176,7 @@
do {
b = vlib_get_buffer (vm, b->next_buffer);
- vec_add2 (tm->iovecs, iov, 1);
+ vec_add2 (tm->wr_iovecs, iov, 1);
iov->iov_base = b->data + b->current_data;
iov->iov_len = b->current_length;
@@ -181,7 +184,8 @@
} while (b->flags & VLIB_BUFFER_NEXT_PRESENT);
}
- if (writev (tm->dev_net_tun_fd, tm->iovecs, vec_len (tm->iovecs)) < l)
+ if (writev (tm->dev_net_tun_fd, tm->wr_iovecs,
+ vec_len (tm->wr_iovecs)) < l)
clib_unix_warning ("writev");
n_bytes += l;
@@ -256,15 +260,15 @@
/** We should have enough buffers left for an MTU sized packet. */
ASSERT (vec_len (tm->rx_buffers) >= tm->mtu_buffers);
- vec_validate (tm->iovecs, tm->mtu_buffers - 1);
+ vec_validate (tm->rd_iovecs, tm->mtu_buffers - 1);
for (i = 0; i < tm->mtu_buffers; i++)
{
b = vlib_get_buffer (vm, tm->rx_buffers[i_rx - i]);
- tm->iovecs[i].iov_base = b->data;
- tm->iovecs[i].iov_len = buffer_size;
+ tm->rd_iovecs[i].iov_base = b->data;
+ tm->rd_iovecs[i].iov_len = buffer_size;
}
- n_bytes_left = readv (tm->dev_net_tun_fd, tm->iovecs, tm->mtu_buffers);
+ n_bytes_left = readv (tm->dev_net_tun_fd, tm->rd_iovecs, tm->mtu_buffers);
n_bytes_in_packet = n_bytes_left;
if (n_bytes_left <= 0)
{