blob: 6ff076c71f54e5c65d6dc56fcae2dbbd94abbc5a [file] [log] [blame]
Damjan Marion0f60ff82017-03-30 17:58:42 +02001From 7ca5c8de65acabe4cb60960adcfa9247efdd2a5c Mon Sep 17 00:00:00 2001
2From: Yongseok Koh <yskoh@mellanox.com>
3Date: Wed, 15 Mar 2017 16:55:44 -0700
4Subject: [PATCH] net/mlx5: add enhanced multi-packet send for ConnectX-5
5
6ConnectX-5 supports enhanced version of multi-packet send (MPS). An MPS Tx
7descriptor can carry multiple packets either by including pointers of
8packets or by inlining packets. Inlining packet data can be helpful to
9better utilize PCIe bandwidth. In addition, Enhanced MPS supports hybrid
10mode - mixing inlined packets and pointers in a descriptor. This feature is
11enabled by default if supported by HW.
12
13Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
14---
15 doc/guides/nics/mlx5.rst | 31 +++-
16 drivers/net/mlx5/mlx5.c | 37 +++-
17 drivers/net/mlx5/mlx5.h | 4 +-
18 drivers/net/mlx5/mlx5_defs.h | 7 +
19 drivers/net/mlx5/mlx5_ethdev.c | 6 +-
20 drivers/net/mlx5/mlx5_prm.h | 20 ++
21 drivers/net/mlx5/mlx5_rxtx.c | 410 +++++++++++++++++++++++++++++++++++++++++
22 drivers/net/mlx5/mlx5_rxtx.h | 7 +-
23 drivers/net/mlx5/mlx5_txq.c | 29 ++-
24 9 files changed, 534 insertions(+), 17 deletions(-)
25
26diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
27index 41f3a47..0783aeb 100644
28--- a/doc/guides/nics/mlx5.rst
29+++ b/doc/guides/nics/mlx5.rst
30@@ -183,10 +183,17 @@ Run-time configuration
31
32 - ``txq_mpw_en`` parameter [int]
33
34- A nonzero value enables multi-packet send. This feature allows the TX
35- burst function to pack up to five packets in two descriptors in order to
36- save PCI bandwidth and improve performance at the cost of a slightly
37- higher CPU usage.
38+ A nonzero value enables multi-packet send (MPS) for ConnectX-4 Lx and
39+ enhanced multi-packet send (Enhanced MPS) for ConnectX-5. MPS allows the
40+ TX burst function to pack up multiple packets in a single descriptor
41+ session in order to save PCI bandwidth and improve performance at the
42+ cost of a slightly higher CPU usage. When ``txq_inline`` is set along
43+ with ``txq_mpw_en``, the TX burst function tries to copy entire packet
44+ data onto the TX descriptor instead of including only a pointer to the
45+ packet, provided there is enough room remaining in the descriptor. ``txq_inline`` sets
46+ per-descriptor space for either pointers or inlined packets. In addition,
47+ Enhanced MPS supports hybrid mode - mixing inlined packets and pointers
48+ in the same descriptor.
49
50 This option cannot be used in conjunction with ``tso`` below. When ``tso``
51 is set, ``txq_mpw_en`` is disabled.
52@@ -194,6 +201,22 @@ Run-time configuration
53 It is currently only supported on the ConnectX-4 Lx and ConnectX-5
54 families of adapters. Enabled by default.
55
56+- ``txq_mpw_hdr_dseg_en`` parameter [int]
57+
58+ A nonzero value enables including two pointers in the first block of TX
59+ descriptor. This can be used to lessen CPU load for memory copy.
60+
61+ Effective only when Enhanced MPS is supported. Disabled by default.
62+
63+- ``txq_max_inline_len`` parameter [int]
64+
65+ Maximum size of a packet to be inlined. If the size of a packet is
66+ larger than the configured value, the packet isn't inlined even though
67+ there's enough space remaining in the descriptor. Instead, the packet
68+ is included with a pointer.
69+
70+ Effective only when Enhanced MPS is supported. The default value is 256.
71+
72 - ``tso`` parameter [int]
73
74 A nonzero value enables hardware TSO.
75diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
76index ebc7984..bc6a34f 100644
77--- a/drivers/net/mlx5/mlx5.c
78+++ b/drivers/net/mlx5/mlx5.c
79@@ -84,6 +84,12 @@
80 /* Device parameter to enable multi-packet send WQEs. */
81 #define MLX5_TXQ_MPW_EN "txq_mpw_en"
82
83+/* Device parameter to include 2 dsegs in the title WQEBB. */
84+#define MLX5_TXQ_MPW_HDR_DSEG_EN "txq_mpw_hdr_dseg_en"
85+
86+/* Device parameter to limit the size of inlining packet. */
87+#define MLX5_TXQ_MAX_INLINE_LEN "txq_max_inline_len"
88+
89 /* Device parameter to enable hardware TSO offload. */
90 #define MLX5_TSO "tso"
91
92@@ -294,7 +300,11 @@ mlx5_args_check(const char *key, const char *val, void *opaque)
93 } else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) {
94 priv->txqs_inline = tmp;
95 } else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) {
96- priv->mps &= !!tmp; /* Enable MPW only if HW supports */
97+ priv->mps = !!tmp ? priv->mps : MLX5_MPW_DISABLED;
98+ } else if (strcmp(MLX5_TXQ_MPW_HDR_DSEG_EN, key) == 0) {
99+ priv->mpw_hdr_dseg = !!tmp;
100+ } else if (strcmp(MLX5_TXQ_MAX_INLINE_LEN, key) == 0) {
101+ priv->inline_max_packet_sz = tmp;
102 } else if (strcmp(MLX5_TSO, key) == 0) {
103 priv->tso = !!tmp;
104 } else {
105@@ -323,6 +333,8 @@ mlx5_args(struct priv *priv, struct rte_devargs *devargs)
106 MLX5_TXQ_INLINE,
107 MLX5_TXQS_MIN_INLINE,
108 MLX5_TXQ_MPW_EN,
109+ MLX5_TXQ_MPW_HDR_DSEG_EN,
110+ MLX5_TXQ_MAX_INLINE_LEN,
111 MLX5_TSO,
112 NULL,
113 };
114@@ -434,24 +446,27 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
115 switch (pci_dev->id.device_id) {
116 case PCI_DEVICE_ID_MELLANOX_CONNECTX4:
117 tunnel_en = 1;
118- mps = 0;
119+ mps = MLX5_MPW_DISABLED;
120 break;
121 case PCI_DEVICE_ID_MELLANOX_CONNECTX4LX:
122+ mps = MLX5_MPW;
123+ break;
124 case PCI_DEVICE_ID_MELLANOX_CONNECTX5:
125 case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
126 case PCI_DEVICE_ID_MELLANOX_CONNECTX5EX:
127 case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
128- mps = 1;
129 tunnel_en = 1;
130+ mps = MLX5_MPW_ENHANCED;
131 break;
132 default:
133- mps = 0;
134+ mps = MLX5_MPW_DISABLED;
135 }
136 INFO("PCI information matches, using device \"%s\""
137- " (SR-IOV: %s, MPS: %s)",
138+ " (SR-IOV: %s, %sMPS: %s)",
139 list[i]->name,
140 sriov ? "true" : "false",
141- mps ? "true" : "false");
142+ mps == MLX5_MPW_ENHANCED ? "Enhanced " : "",
143+ mps != MLX5_MPW_DISABLED ? "true" : "false");
144 attr_ctx = ibv_open_device(list[i]);
145 err = errno;
146 break;
147@@ -546,6 +561,13 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
148 priv->pd = pd;
149 priv->mtu = ETHER_MTU;
150 priv->mps = mps; /* Enable MPW by default if supported. */
151+ /* Set default values for Enhanced MPW, a.k.a MPWv2. */
152+ if (mps == MLX5_MPW_ENHANCED) {
153+ priv->mpw_hdr_dseg = 0;
154+ priv->txqs_inline = MLX5_EMPW_MIN_TXQS;
155+ priv->inline_max_packet_sz = MLX5_EMPW_MAX_INLINE_LEN;
156+ priv->txq_inline = MLX5_WQE_SIZE_MAX - MLX5_WQE_SIZE;
157+ }
158 priv->cqe_comp = 1; /* Enable compression by default. */
159 priv->tunnel_en = tunnel_en;
160 err = mlx5_args(priv, pci_dev->device.devargs);
161@@ -613,6 +635,9 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
162 "with TSO. MPS disabled");
163 priv->mps = 0;
164 }
165+ INFO("%sMPS is %s",
166+ priv->mps == MLX5_MPW_ENHANCED ? "Enhanced " : "",
167+ priv->mps != MLX5_MPW_DISABLED ? "enabled" : "disabled");
168 /* Allocate and register default RSS hash keys. */
169 priv->rss_conf = rte_calloc(__func__, hash_rxq_init_n,
170 sizeof((*priv->rss_conf)[0]), 0);
171diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
172index 870e01f..d26d465 100644
173--- a/drivers/net/mlx5/mlx5.h
174+++ b/drivers/net/mlx5/mlx5.h
175@@ -123,7 +123,8 @@ struct priv {
176 unsigned int hw_fcs_strip:1; /* FCS stripping is supported. */
177 unsigned int hw_padding:1; /* End alignment padding is supported. */
178 unsigned int sriov:1; /* This is a VF or PF with VF devices. */
179- unsigned int mps:1; /* Whether multi-packet send is supported. */
180+ unsigned int mps:2; /* Multi-packet send mode (0: disabled). */
181+ unsigned int mpw_hdr_dseg:1; /* Enable DSEGs in the title WQEBB. */
182 unsigned int cqe_comp:1; /* Whether CQE compression is enabled. */
183 unsigned int pending_alarm:1; /* An alarm is pending. */
184 unsigned int tso:1; /* Whether TSO is supported. */
185@@ -132,6 +133,7 @@ struct priv {
186 unsigned int max_tso_payload_sz; /* Maximum TCP payload for TSO. */
187 unsigned int txq_inline; /* Maximum packet size for inlining. */
188 unsigned int txqs_inline; /* Queue number threshold for inlining. */
189+ unsigned int inline_max_packet_sz; /* Max packet size for inlining. */
190 /* RX/TX queues. */
191 unsigned int rxqs_n; /* RX queues array size. */
192 unsigned int txqs_n; /* TX queues array size. */
193diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
194index eecb908..201bb33 100644
195--- a/drivers/net/mlx5/mlx5_defs.h
196+++ b/drivers/net/mlx5/mlx5_defs.h
197@@ -55,6 +55,13 @@
198 #define MLX5_TX_COMP_THRESH 32
199
200 /*
201+ * Request TX completion every time the total number of WQEBBs used for inlining
202+ * packets exceeds the size of WQ divided by this divisor. Better to be power of
203+ * two for performance.
204+ */
205+#define MLX5_TX_COMP_THRESH_INLINE_DIV (1 << 3)
206+
207+/*
208 * Maximum number of cached Memory Pools (MPs) per TX queue. Each RTE MP
209 * from which buffers are to be transmitted will have to be mapped by this
210 * driver to their own Memory Region (MR). This is a slow operation.
211diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
212index 5deb6e8..dd5fe5c 100644
213--- a/drivers/net/mlx5/mlx5_ethdev.c
214+++ b/drivers/net/mlx5/mlx5_ethdev.c
215@@ -1590,7 +1590,11 @@ priv_select_tx_function(struct priv *priv)
216 {
217 priv->dev->tx_pkt_burst = mlx5_tx_burst;
218 /* Select appropriate TX function. */
219- if (priv->mps && priv->txq_inline) {
220+ if (priv->mps == MLX5_MPW_ENHANCED) {
221+ priv->dev->tx_pkt_burst =
222+ mlx5_tx_burst_empw;
223+ DEBUG("selected Enhanced MPW TX function");
224+ } else if (priv->mps && priv->txq_inline) {
225 priv->dev->tx_pkt_burst = mlx5_tx_burst_mpw_inline;
226 DEBUG("selected MPW inline TX function");
227 } else if (priv->mps) {
228diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h
229index 0a77f5b..155bdba 100644
230--- a/drivers/net/mlx5/mlx5_prm.h
231+++ b/drivers/net/mlx5/mlx5_prm.h
232@@ -73,6 +73,9 @@
233 /* WQE size */
234 #define MLX5_WQE_SIZE (4 * MLX5_WQE_DWORD_SIZE)
235
236+/* Max size of a WQE session. */
237+#define MLX5_WQE_SIZE_MAX 960U
238+
239 /* Compute the number of DS. */
240 #define MLX5_WQE_DS(n) \
241 (((n) + MLX5_WQE_DWORD_SIZE - 1) / MLX5_WQE_DWORD_SIZE)
242@@ -80,10 +83,19 @@
243 /* Room for inline data in multi-packet WQE. */
244 #define MLX5_MWQE64_INL_DATA 28
245
246+/* Default minimum number of Tx queues for inlining packets. */
247+#define MLX5_EMPW_MIN_TXQS 8
248+
249+/* Default max packet length to be inlined. */
250+#define MLX5_EMPW_MAX_INLINE_LEN (4U * MLX5_WQE_SIZE)
251+
252 #ifndef HAVE_VERBS_MLX5_OPCODE_TSO
253 #define MLX5_OPCODE_TSO MLX5_OPCODE_LSO_MPW /* Compat with OFED 3.3. */
254 #endif
255
256+#define MLX5_OPC_MOD_ENHANCED_MPSW 0
257+#define MLX5_OPCODE_ENHANCED_MPSW 0x29
258+
259 /* CQE value to inform that VLAN is stripped. */
260 #define MLX5_CQE_VLAN_STRIPPED (1u << 0)
261
262@@ -176,10 +188,18 @@ struct mlx5_wqe64 {
263 uint8_t raw[32];
264 } __rte_aligned(MLX5_WQE_SIZE);
265
266+/* MPW mode. */
267+enum mlx5_mpw_mode {
268+ MLX5_MPW_DISABLED,
269+ MLX5_MPW,
270+ MLX5_MPW_ENHANCED, /* Enhanced Multi-Packet Send WQE, a.k.a MPWv2. */
271+};
272+
273 /* MPW session status. */
274 enum mlx5_mpw_state {
275 MLX5_MPW_STATE_OPENED,
276 MLX5_MPW_INL_STATE_OPENED,
277+ MLX5_MPW_ENHANCED_STATE_OPENED,
278 MLX5_MPW_STATE_CLOSED,
279 };
280
281diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
282index 9fc433e..a1dd84a 100644
283--- a/drivers/net/mlx5/mlx5_rxtx.c
284+++ b/drivers/net/mlx5/mlx5_rxtx.c
285@@ -195,6 +195,62 @@ tx_mlx5_wqe(struct txq *txq, uint16_t ci)
286 }
287
288 /**
289+ * Return the size of tailroom of WQ.
290+ *
291+ * @param txq
292+ * Pointer to TX queue structure.
293+ * @param addr
294+ * Pointer to tail of WQ.
295+ *
296+ * @return
297+ * Size of tailroom.
298+ */
299+static inline size_t
300+tx_mlx5_wq_tailroom(struct txq *txq, void *addr)
301+{
302+ size_t tailroom;
303+ tailroom = (uintptr_t)(txq->wqes) +
304+ (1 << txq->wqe_n) * MLX5_WQE_SIZE -
305+ (uintptr_t)addr;
306+ return tailroom;
307+}
308+
309+/**
310+ * Copy data to tailroom of circular queue.
311+ *
312+ * @param dst
313+ * Pointer to destination.
314+ * @param src
315+ * Pointer to source.
316+ * @param n
317+ * Number of bytes to copy.
318+ * @param base
319+ * Pointer to head of queue.
320+ * @param tailroom
321+ * Size of tailroom from dst.
322+ *
323+ * @return
324+ * Pointer after copied data.
325+ */
326+static inline void *
327+mlx5_copy_to_wq(void *dst, const void *src, size_t n,
328+ void *base, size_t tailroom)
329+{
330+ void *ret;
331+
332+ if (n > tailroom) {
333+ rte_memcpy(dst, src, tailroom);
334+ rte_memcpy(base, (void *)((uintptr_t)src + tailroom),
335+ n - tailroom);
336+ ret = (uint8_t *)base + n - tailroom;
337+ } else {
338+ rte_memcpy(dst, src, n);
339+ ret = (n == tailroom) ? base : (uint8_t *)dst + n;
340+ }
341+ return ret;
342+}
343+
344+/**
345 * Manage TX completions.
346 *
347 * When sending a burst, mlx5_tx_burst() posts several WRs.
348@@ -1269,6 +1325,360 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
349 }
350
351 /**
352+ * Open an Enhanced MPW session.
353+ *
354+ * @param txq
355+ * Pointer to TX queue structure.
356+ * @param mpw
357+ * Pointer to MPW session structure.
358+ * @param length
359+ * Packet length.
360+ */
361+static inline void
362+mlx5_empw_new(struct txq *txq, struct mlx5_mpw *mpw, int padding)
363+{
364+ uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
365+
366+ mpw->state = MLX5_MPW_ENHANCED_STATE_OPENED;
367+ mpw->pkts_n = 0;
368+ mpw->total_len = sizeof(struct mlx5_wqe);
369+ mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
370+ mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_ENHANCED_MPSW << 24) |
371+ (txq->wqe_ci << 8) |
372+ MLX5_OPCODE_ENHANCED_MPSW);
373+ mpw->wqe->ctrl[2] = 0;
374+ mpw->wqe->ctrl[3] = 0;
375+ memset((void *)(uintptr_t)&mpw->wqe->eseg, 0, MLX5_WQE_DWORD_SIZE);
376+ if (unlikely(padding)) {
377+ uintptr_t addr = (uintptr_t)(mpw->wqe + 1);
378+
379+ /* Pad the first 2 DWORDs with zero-length inline header. */
380+ *(volatile uint32_t *)addr = htonl(MLX5_INLINE_SEG);
381+ *(volatile uint32_t *)(addr + MLX5_WQE_DWORD_SIZE) =
382+ htonl(MLX5_INLINE_SEG);
383+ mpw->total_len += 2 * MLX5_WQE_DWORD_SIZE;
384+ /* Start from the next WQEBB. */
385+ mpw->data.raw = (volatile void *)(tx_mlx5_wqe(txq, idx + 1));
386+ } else {
387+ mpw->data.raw = (volatile void *)(mpw->wqe + 1);
388+ }
389+}
390+
391+/**
392+ * Close an Enhanced MPW session.
393+ *
394+ * @param txq
395+ * Pointer to TX queue structure.
396+ * @param mpw
397+ * Pointer to MPW session structure.
398+ *
399+ * @return
400+ * Number of consumed WQEs.
401+ */
402+static inline uint16_t
403+mlx5_empw_close(struct txq *txq, struct mlx5_mpw *mpw)
404+{
405+ uint16_t ret;
406+
407+ /* Store size in multiple of 16 bytes. Control and Ethernet segments
408+ * count as 2.
409+ */
410+ mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | MLX5_WQE_DS(mpw->total_len));
411+ mpw->state = MLX5_MPW_STATE_CLOSED;
412+ ret = (mpw->total_len + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
413+ txq->wqe_ci += ret;
414+ return ret;
415+}
416+
417+/**
418+ * DPDK callback for TX with Enhanced MPW support.
419+ *
420+ * @param dpdk_txq
421+ * Generic pointer to TX queue structure.
422+ * @param[in] pkts
423+ * Packets to transmit.
424+ * @param pkts_n
425+ * Number of packets in array.
426+ *
427+ * @return
428+ * Number of packets successfully transmitted (<= pkts_n).
429+ */
430+uint16_t
431+mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
432+{
433+ struct txq *txq = (struct txq *)dpdk_txq;
434+ uint16_t elts_head = txq->elts_head;
435+ const unsigned int elts_n = 1 << txq->elts_n;
436+ unsigned int i = 0;
437+ unsigned int j = 0;
438+ unsigned int max_elts;
439+ uint16_t max_wqe;
440+ unsigned int max_inline = txq->max_inline * RTE_CACHE_LINE_SIZE;
441+ unsigned int mpw_room = 0;
442+ unsigned int inl_pad = 0;
443+ uint32_t inl_hdr;
444+ struct mlx5_mpw mpw = {
445+ .state = MLX5_MPW_STATE_CLOSED,
446+ };
447+
448+ if (unlikely(!pkts_n))
449+ return 0;
450+ /* Start processing. */
451+ txq_complete(txq);
452+ max_elts = (elts_n - (elts_head - txq->elts_tail));
453+ if (max_elts > elts_n)
454+ max_elts -= elts_n;
455+ /* A CQE slot must always be available. */
456+ assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci));
457+ max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
458+ if (unlikely(!max_wqe))
459+ return 0;
460+ do {
461+ struct rte_mbuf *buf = *(pkts++);
462+ unsigned int elts_head_next;
463+ uintptr_t addr;
464+ uint64_t naddr;
465+ unsigned int n;
466+ unsigned int do_inline = 0; /* Whether inline is possible. */
467+ uint32_t length;
468+ unsigned int segs_n = buf->nb_segs;
469+ uint32_t cs_flags = 0;
470+
471+ /*
472+ * Make sure there is enough room to store this packet and
473+ * that one ring entry remains unused.
474+ */
475+ assert(segs_n);
476+ if (max_elts - j < segs_n + 1)
477+ break;
478+ /* Do not bother with large packets MPW cannot handle. */
479+ if (segs_n > MLX5_MPW_DSEG_MAX)
480+ break;
481+ /* Should we enable HW CKSUM offload. */
482+ if (buf->ol_flags &
483+ (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
484+ cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
485+ /* Retrieve packet information. */
486+ length = PKT_LEN(buf);
487+ /* Start new session if:
488+ * - multi-segment packet
489+ * - no space left even for a dseg
490+ * - next packet can be inlined with a new WQE
491+ * - cs_flag differs
492+ * It can't be MLX5_MPW_STATE_OPENED as always have a single
493+ * segmented packet.
494+ */
495+ if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED) {
496+ if ((segs_n != 1) ||
497+ (inl_pad + sizeof(struct mlx5_wqe_data_seg) >
498+ mpw_room) ||
499+ (length <= txq->inline_max_packet_sz &&
500+ inl_pad + sizeof(inl_hdr) + length >
501+ mpw_room) ||
502+ (mpw.wqe->eseg.cs_flags != cs_flags))
503+ max_wqe -= mlx5_empw_close(txq, &mpw);
504+ }
505+ if (unlikely(mpw.state == MLX5_MPW_STATE_CLOSED)) {
506+ if (unlikely(segs_n != 1)) {
507+ /* Fall back to legacy MPW.
508+ * A MPW session consumes 2 WQEs at most to
509+ * include MLX5_MPW_DSEG_MAX pointers.
510+ */
511+ if (unlikely(max_wqe < 2))
512+ break;
513+ mlx5_mpw_new(txq, &mpw, length);
514+ } else {
515+ /* In Enhanced MPW, inline as much as the budget
516+ * is allowed. The remaining space is to be
517+ * filled with dsegs. If the title WQEBB isn't
518+ * padded, it will have 2 dsegs there.
519+ */
520+ mpw_room = RTE_MIN(MLX5_WQE_SIZE_MAX,
521+ (max_inline ? max_inline :
522+ pkts_n * MLX5_WQE_DWORD_SIZE) +
523+ MLX5_WQE_SIZE);
524+ if (unlikely(max_wqe * MLX5_WQE_SIZE <
525+ mpw_room))
526+ break;
527+ /* Don't pad the title WQEBB to not waste WQ. */
528+ mlx5_empw_new(txq, &mpw, 0);
529+ mpw_room -= mpw.total_len;
530+ inl_pad = 0;
531+ do_inline =
532+ length <= txq->inline_max_packet_sz &&
533+ sizeof(inl_hdr) + length <= mpw_room &&
534+ !txq->mpw_hdr_dseg;
535+ }
536+ mpw.wqe->eseg.cs_flags = cs_flags;
537+ } else {
538+ /* Evaluate whether the next packet can be inlined.
539+ * Inlininig is possible when:
540+ * - length is less than configured value
541+ * - length fits for remaining space
542+ * - not required to fill the title WQEBB with dsegs
543+ */
544+ do_inline =
545+ length <= txq->inline_max_packet_sz &&
546+ inl_pad + sizeof(inl_hdr) + length <=
547+ mpw_room &&
548+ (!txq->mpw_hdr_dseg ||
549+ mpw.total_len >= MLX5_WQE_SIZE);
550+ }
551+ /* Multi-segment packets must be alone in their MPW. */
552+ assert((segs_n == 1) || (mpw.pkts_n == 0));
553+ if (unlikely(mpw.state == MLX5_MPW_STATE_OPENED)) {
554+#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
555+ length = 0;
556+#endif
557+ do {
558+ volatile struct mlx5_wqe_data_seg *dseg;
559+
560+ elts_head_next =
561+ (elts_head + 1) & (elts_n - 1);
562+ assert(buf);
563+ (*txq->elts)[elts_head] = buf;
564+ dseg = mpw.data.dseg[mpw.pkts_n];
565+ addr = rte_pktmbuf_mtod(buf, uintptr_t);
566+ *dseg = (struct mlx5_wqe_data_seg){
567+ .byte_count = htonl(DATA_LEN(buf)),
568+ .lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
569+ .addr = htonll(addr),
570+ };
571+ elts_head = elts_head_next;
572+#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
573+ length += DATA_LEN(buf);
574+#endif
575+ buf = buf->next;
576+ ++j;
577+ ++mpw.pkts_n;
578+ } while (--segs_n);
579+ /* A multi-segmented packet takes one MPW session.
580+ * TODO: Pack more multi-segmented packets if possible.
581+ */
582+ mlx5_mpw_close(txq, &mpw);
583+ if (mpw.pkts_n < 3)
584+ max_wqe--;
585+ else
586+ max_wqe -= 2;
587+ } else if (do_inline) {
588+ /* Inline packet into WQE. */
589+ unsigned int max;
590+
591+ assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED);
592+ assert(length == DATA_LEN(buf));
593+ inl_hdr = htonl(length | MLX5_INLINE_SEG);
594+ addr = rte_pktmbuf_mtod(buf, uintptr_t);
595+ mpw.data.raw = (volatile void *)
596+ ((uintptr_t)mpw.data.raw + inl_pad);
597+ max = tx_mlx5_wq_tailroom(txq,
598+ (void *)(uintptr_t)mpw.data.raw);
599+ /* Copy inline header. */
600+ mpw.data.raw = (volatile void *)
601+ mlx5_copy_to_wq(
602+ (void *)(uintptr_t)mpw.data.raw,
603+ &inl_hdr,
604+ sizeof(inl_hdr),
605+ (void *)(uintptr_t)txq->wqes,
606+ max);
607+ max = tx_mlx5_wq_tailroom(txq,
608+ (void *)(uintptr_t)mpw.data.raw);
609+ /* Copy packet data. */
610+ mpw.data.raw = (volatile void *)
611+ mlx5_copy_to_wq(
612+ (void *)(uintptr_t)mpw.data.raw,
613+ (void *)addr,
614+ length,
615+ (void *)(uintptr_t)txq->wqes,
616+ max);
617+ ++mpw.pkts_n;
618+ mpw.total_len += (inl_pad + sizeof(inl_hdr) + length);
619+ /* No need to get completion as the entire packet is
620+ * copied to WQ. Free the buf right away.
621+ */
622+ elts_head_next = elts_head;
623+ rte_pktmbuf_free_seg(buf);
624+ mpw_room -= (inl_pad + sizeof(inl_hdr) + length);
625+ /* Add pad in the next packet if any. */
626+ inl_pad = (((uintptr_t)mpw.data.raw +
627+ (MLX5_WQE_DWORD_SIZE - 1)) &
628+ ~(MLX5_WQE_DWORD_SIZE - 1)) -
629+ (uintptr_t)mpw.data.raw;
630+ } else {
631+ /* No inline. Load a dseg of packet pointer. */
632+ volatile rte_v128u32_t *dseg;
633+
634+ assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED);
635+ assert((inl_pad + sizeof(*dseg)) <= mpw_room);
636+ assert(length == DATA_LEN(buf));
637+ if (!tx_mlx5_wq_tailroom(txq,
638+ (void *)((uintptr_t)mpw.data.raw
639+ + inl_pad)))
640+ dseg = (volatile void *)txq->wqes;
641+ else
642+ dseg = (volatile void *)
643+ ((uintptr_t)mpw.data.raw +
644+ inl_pad);
645+ elts_head_next = (elts_head + 1) & (elts_n - 1);
646+ (*txq->elts)[elts_head] = buf;
647+ addr = rte_pktmbuf_mtod(buf, uintptr_t);
648+ for (n = 0; n * RTE_CACHE_LINE_SIZE < length; n++)
649+ rte_prefetch2((void *)(addr +
650+ n * RTE_CACHE_LINE_SIZE));
651+ naddr = htonll(addr);
652+ *dseg = (rte_v128u32_t) {
653+ htonl(length),
654+ txq_mp2mr(txq, txq_mb2mp(buf)),
655+ naddr,
656+ naddr >> 32,
657+ };
658+ mpw.data.raw = (volatile void *)(dseg + 1);
659+ mpw.total_len += (inl_pad + sizeof(*dseg));
660+ ++j;
661+ ++mpw.pkts_n;
662+ mpw_room -= (inl_pad + sizeof(*dseg));
663+ inl_pad = 0;
664+ }
665+ elts_head = elts_head_next;
666+#ifdef MLX5_PMD_SOFT_COUNTERS
667+ /* Increment sent bytes counter. */
668+ txq->stats.obytes += length;
669+#endif
670+ ++i;
671+ } while (i < pkts_n);
672+ /* Take a shortcut if nothing must be sent. */
673+ if (unlikely(i == 0))
674+ return 0;
675+ /* Check whether completion threshold has been reached. */
676+ if (txq->elts_comp + j >= MLX5_TX_COMP_THRESH ||
677+ (uint16_t)(txq->wqe_ci - txq->mpw_comp) >=
678+ (1 << txq->wqe_n) / MLX5_TX_COMP_THRESH_INLINE_DIV) {
679+ volatile struct mlx5_wqe *wqe = mpw.wqe;
680+
681+ /* Request completion on last WQE. */
682+ wqe->ctrl[2] = htonl(8);
683+ /* Save elts_head in unused "immediate" field of WQE. */
684+ wqe->ctrl[3] = elts_head;
685+ txq->elts_comp = 0;
686+ txq->mpw_comp = txq->wqe_ci;
687+ txq->cq_pi++;
688+ } else {
689+ txq->elts_comp += j;
690+ }
691+#ifdef MLX5_PMD_SOFT_COUNTERS
692+ /* Increment sent packets counter. */
693+ txq->stats.opackets += i;
694+#endif
695+ if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED)
696+ mlx5_empw_close(txq, &mpw);
697+ else if (mpw.state == MLX5_MPW_STATE_OPENED)
698+ mlx5_mpw_close(txq, &mpw);
699+ /* Ring QP doorbell. */
700+ mlx5_tx_dbrec(txq, mpw.wqe);
701+ txq->elts_head = elts_head;
702+ return i;
703+}
704+
705+/**
706 * Translate RX completion flags to packet type.
707 *
708 * @param[in] cqe
709diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
710index 0db810c..4a4bd84 100644
711--- a/drivers/net/mlx5/mlx5_rxtx.h
712+++ b/drivers/net/mlx5/mlx5_rxtx.h
713@@ -248,17 +248,21 @@ struct txq {
714 uint16_t elts_head; /* Current index in (*elts)[]. */
715 uint16_t elts_tail; /* First element awaiting completion. */
716 uint16_t elts_comp; /* Counter since last completion request. */
717+ uint16_t mpw_comp; /* WQ index since last completion request. */
718 uint16_t cq_ci; /* Consumer index for completion queue. */
719+ uint16_t cq_pi; /* Producer index for completion queue. */
720 uint16_t wqe_ci; /* Consumer index for work queue. */
721 uint16_t wqe_pi; /* Producer index for work queue. */
722 uint16_t elts_n:4; /* (*elts)[] length (in log2). */
723 uint16_t cqe_n:4; /* Number of CQ elements (in log2). */
724 uint16_t wqe_n:4; /* Number of of WQ elements (in log2). */
725- uint16_t max_inline; /* Multiple of RTE_CACHE_LINE_SIZE to inline. */
726 uint16_t inline_en:1; /* When set inline is enabled. */
727 uint16_t tso_en:1; /* When set hardware TSO is enabled. */
728 uint16_t tunnel_en:1;
729 /* When set TX offload for tunneled packets are supported. */
730+ uint16_t mpw_hdr_dseg:1; /* Enable DSEGs in the title WQEBB. */
731+ uint16_t max_inline; /* Multiple of RTE_CACHE_LINE_SIZE to inline. */
732+ uint16_t inline_max_packet_sz; /* Max packet size for inlining. */
733 uint32_t qp_num_8s; /* QP number shifted by 8. */
734 volatile struct mlx5_cqe (*cqes)[]; /* Completion queue. */
735 volatile void *wqes; /* Work queue (use volatile to write into). */
736@@ -329,6 +333,7 @@ uint16_t mlx5_tx_burst_secondary_setup(void *, struct rte_mbuf **, uint16_t);
737 uint16_t mlx5_tx_burst(void *, struct rte_mbuf **, uint16_t);
738 uint16_t mlx5_tx_burst_mpw(void *, struct rte_mbuf **, uint16_t);
739 uint16_t mlx5_tx_burst_mpw_inline(void *, struct rte_mbuf **, uint16_t);
740+uint16_t mlx5_tx_burst_empw(void *, struct rte_mbuf **, uint16_t);
741 uint16_t mlx5_rx_burst(void *, struct rte_mbuf **, uint16_t);
742 uint16_t removed_tx_burst(void *, struct rte_mbuf **, uint16_t);
743 uint16_t removed_rx_burst(void *, struct rte_mbuf **, uint16_t);
744diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
745index 9d0c00f..bbfce75 100644
746--- a/drivers/net/mlx5/mlx5_txq.c
747+++ b/drivers/net/mlx5/mlx5_txq.c
748@@ -266,6 +266,7 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
749 struct ibv_exp_cq_attr cq_attr;
750 } attr;
751 enum ibv_exp_query_intf_status status;
752+ unsigned int cqe_n;
753 int ret = 0;
754
755 if (mlx5_getenv_int("MLX5_ENABLE_CQE_COMPRESSION")) {
756@@ -276,6 +277,8 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
757 (void)conf; /* Thresholds configuration (ignored). */
758 assert(desc > MLX5_TX_COMP_THRESH);
759 tmpl.txq.elts_n = log2above(desc);
760+ if (priv->mps == MLX5_MPW_ENHANCED)
761+ tmpl.txq.mpw_hdr_dseg = priv->mpw_hdr_dseg;
762 /* MRs will be registered in mp2mr[] later. */
763 attr.rd = (struct ibv_exp_res_domain_init_attr){
764 .comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
765@@ -294,9 +297,12 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
766 .comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
767 .res_domain = tmpl.rd,
768 };
769+ cqe_n = ((desc / MLX5_TX_COMP_THRESH) - 1) ?
770+ ((desc / MLX5_TX_COMP_THRESH) - 1) : 1;
771+ if (priv->mps == MLX5_MPW_ENHANCED)
772+ cqe_n += MLX5_TX_COMP_THRESH_INLINE_DIV;
773 tmpl.cq = ibv_exp_create_cq(priv->ctx,
774- (((desc / MLX5_TX_COMP_THRESH) - 1) ?
775- ((desc / MLX5_TX_COMP_THRESH) - 1) : 1),
776+ cqe_n,
777 NULL, NULL, 0, &attr.cq);
778 if (tmpl.cq == NULL) {
779 ret = ENOMEM;
780@@ -340,9 +346,24 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
781 tmpl.txq.max_inline =
782 ((priv->txq_inline + (RTE_CACHE_LINE_SIZE - 1)) /
783 RTE_CACHE_LINE_SIZE);
784- attr.init.cap.max_inline_data =
785- tmpl.txq.max_inline * RTE_CACHE_LINE_SIZE;
786 tmpl.txq.inline_en = 1;
787+ /* TSO and MPS can't be enabled concurrently. */
788+ assert(!priv->tso || !priv->mps);
789+ if (priv->mps == MLX5_MPW_ENHANCED) {
790+ tmpl.txq.inline_max_packet_sz =
791+ priv->inline_max_packet_sz;
792+ /* To minimize the size of data set, avoid requesting
793+ * too large WQ.
794+ */
795+ attr.init.cap.max_inline_data =
796+ ((RTE_MIN(priv->txq_inline,
797+ priv->inline_max_packet_sz) +
798+ (RTE_CACHE_LINE_SIZE - 1)) /
799+ RTE_CACHE_LINE_SIZE) * RTE_CACHE_LINE_SIZE;
800+ } else {
801+ attr.init.cap.max_inline_data =
802+ tmpl.txq.max_inline * RTE_CACHE_LINE_SIZE;
803+ }
804 }
805 if (priv->tso) {
806 uint16_t max_tso_inline = ((MLX5_MAX_TSO_HEADER +
807--
8082.7.4
809