| From 7ca5c8de65acabe4cb60960adcfa9247efdd2a5c Mon Sep 17 00:00:00 2001 |
| From: Yongseok Koh <yskoh@mellanox.com> |
| Date: Wed, 15 Mar 2017 16:55:44 -0700 |
| Subject: [PATCH] net/mlx5: add enhanced multi-packet send for ConnectX-5 |
| |
| ConnectX-5 supports an enhanced version of multi-packet send (MPS). An |
| MPS Tx descriptor can carry multiple packets, either by including pointers |
| to packets or by inlining packet data. Inlining packet data helps to |
| better utilize PCIe bandwidth. In addition, Enhanced MPS supports a hybrid |
| mode - mixing inlined packets and pointers in the same descriptor. This |
| feature is enabled by default if supported by HW. |
| |
| Signed-off-by: Yongseok Koh <yskoh@mellanox.com> |
| --- |
| doc/guides/nics/mlx5.rst | 31 +++- |
| drivers/net/mlx5/mlx5.c | 37 +++- |
| drivers/net/mlx5/mlx5.h | 4 +- |
| drivers/net/mlx5/mlx5_defs.h | 7 + |
| drivers/net/mlx5/mlx5_ethdev.c | 6 +- |
| drivers/net/mlx5/mlx5_prm.h | 20 ++ |
| drivers/net/mlx5/mlx5_rxtx.c | 410 +++++++++++++++++++++++++++++++++++++++++ |
| drivers/net/mlx5/mlx5_rxtx.h | 7 +- |
| drivers/net/mlx5/mlx5_txq.c | 29 ++- |
| 9 files changed, 534 insertions(+), 17 deletions(-) |
| |
| diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst |
| index 41f3a47..0783aeb 100644 |
| --- a/doc/guides/nics/mlx5.rst |
| +++ b/doc/guides/nics/mlx5.rst |
| @@ -183,10 +183,17 @@ Run-time configuration |
| |
| - ``txq_mpw_en`` parameter [int] |
| |
| - A nonzero value enables multi-packet send. This feature allows the TX |
| - burst function to pack up to five packets in two descriptors in order to |
| - save PCI bandwidth and improve performance at the cost of a slightly |
| - higher CPU usage. |
| + A nonzero value enables multi-packet send (MPS) for ConnectX-4 Lx and |
| + enhanced multi-packet send (Enhanced MPS) for ConnectX-5. MPS allows the |
| + TX burst function to pack multiple packets into a single descriptor |
| + session in order to save PCI bandwidth and improve performance, at the |
| + cost of slightly higher CPU usage. When ``txq_inline`` is set along with |
| + ``txq_mpw_en``, the TX burst function copies entire packet data into the |
| + TX descriptor instead of including only a pointer to the packet, provided |
| + there is enough room remaining in the descriptor. ``txq_inline`` sets the |
| + per-descriptor space available for either pointers or inlined packets. In |
| + addition, Enhanced MPS supports a hybrid mode - mixing inlined packets and |
| + pointers in the same descriptor. |
| |
| This option cannot be used in conjunction with ``tso`` below. When ``tso`` |
| is set, ``txq_mpw_en`` is disabled. |
| @@ -194,6 +201,22 @@ Run-time configuration |
| It is currently only supported on the ConnectX-4 Lx and ConnectX-5 |
| families of adapters. Enabled by default. |
| |
| +- ``txq_mpw_hdr_dseg_en`` parameter [int] |
| + |
| + A nonzero value enables including two pointers in the first block of the |
| + TX descriptor. This can be used to lessen the CPU load of memory copies. |
| + |
| + Effective only when Enhanced MPS is supported. Disabled by default. |
| + |
| +- ``txq_max_inline_len`` parameter [int] |
| + |
| + Maximum size of a packet to be inlined. If the size of a packet is |
| + larger than the configured value, the packet isn't inlined even though |
| + there's enough space remaining in the descriptor. Instead, the packet |
| + is referenced by a pointer. |
| + |
| + Effective only when Enhanced MPS is supported. The default value is 256. |
| + |
| - ``tso`` parameter [int] |
| |
| A nonzero value enables hardware TSO. |
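| |
| (Usage note: like the parameters above, these options are passed as |
| device arguments on the EAL command line, for example |
| ``-w 05:00.0,txq_mpw_en=1,txq_max_inline_len=256`` with testpmd; the |
| PCI address here is a placeholder.) |
| |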
| diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c |
| index ebc7984..bc6a34f 100644 |
| --- a/drivers/net/mlx5/mlx5.c |
| +++ b/drivers/net/mlx5/mlx5.c |
| @@ -84,6 +84,12 @@ |
| /* Device parameter to enable multi-packet send WQEs. */ |
| #define MLX5_TXQ_MPW_EN "txq_mpw_en" |
| |
| +/* Device parameter to include 2 dsegs in the title WQEBB. */ |
| +#define MLX5_TXQ_MPW_HDR_DSEG_EN "txq_mpw_hdr_dseg_en" |
| + |
| +/* Device parameter to limit the maximum size of an inlined packet. */ |
| +#define MLX5_TXQ_MAX_INLINE_LEN "txq_max_inline_len" |
| + |
| /* Device parameter to enable hardware TSO offload. */ |
| #define MLX5_TSO "tso" |
| |
| @@ -294,7 +300,11 @@ mlx5_args_check(const char *key, const char *val, void *opaque) |
| } else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) { |
| priv->txqs_inline = tmp; |
| } else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) { |
| - priv->mps &= !!tmp; /* Enable MPW only if HW supports */ |
| + priv->mps = !!tmp ? priv->mps : MLX5_MPW_DISABLED; |
| + } else if (strcmp(MLX5_TXQ_MPW_HDR_DSEG_EN, key) == 0) { |
| + priv->mpw_hdr_dseg = !!tmp; |
| + } else if (strcmp(MLX5_TXQ_MAX_INLINE_LEN, key) == 0) { |
| + priv->inline_max_packet_sz = tmp; |
| } else if (strcmp(MLX5_TSO, key) == 0) { |
| priv->tso = !!tmp; |
| } else { |
| @@ -323,6 +333,8 @@ mlx5_args(struct priv *priv, struct rte_devargs *devargs) |
| MLX5_TXQ_INLINE, |
| MLX5_TXQS_MIN_INLINE, |
| MLX5_TXQ_MPW_EN, |
| + MLX5_TXQ_MPW_HDR_DSEG_EN, |
| + MLX5_TXQ_MAX_INLINE_LEN, |
| MLX5_TSO, |
| NULL, |
| }; |
| @@ -434,24 +446,27 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) |
| switch (pci_dev->id.device_id) { |
| case PCI_DEVICE_ID_MELLANOX_CONNECTX4: |
| tunnel_en = 1; |
| - mps = 0; |
| + mps = MLX5_MPW_DISABLED; |
| break; |
| case PCI_DEVICE_ID_MELLANOX_CONNECTX4LX: |
| + mps = MLX5_MPW; |
| + break; |
| case PCI_DEVICE_ID_MELLANOX_CONNECTX5: |
| case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF: |
| case PCI_DEVICE_ID_MELLANOX_CONNECTX5EX: |
| case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF: |
| - mps = 1; |
| tunnel_en = 1; |
| + mps = MLX5_MPW_ENHANCED; |
| break; |
| default: |
| - mps = 0; |
| + mps = MLX5_MPW_DISABLED; |
| } |
| INFO("PCI information matches, using device \"%s\"" |
| - " (SR-IOV: %s, MPS: %s)", |
| + " (SR-IOV: %s, %sMPS: %s)", |
| list[i]->name, |
| sriov ? "true" : "false", |
| - mps ? "true" : "false"); |
| + mps == MLX5_MPW_ENHANCED ? "Enhanced " : "", |
| + mps != MLX5_MPW_DISABLED ? "true" : "false"); |
| attr_ctx = ibv_open_device(list[i]); |
| err = errno; |
| break; |
| @@ -546,6 +561,13 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) |
| priv->pd = pd; |
| priv->mtu = ETHER_MTU; |
| priv->mps = mps; /* Enable MPW by default if supported. */ |
| + /* Set default values for Enhanced MPW, a.k.a. MPWv2. */ |
| + if (mps == MLX5_MPW_ENHANCED) { |
| + priv->mpw_hdr_dseg = 0; |
| + priv->txqs_inline = MLX5_EMPW_MIN_TXQS; |
| + priv->inline_max_packet_sz = MLX5_EMPW_MAX_INLINE_LEN; |
| + priv->txq_inline = MLX5_WQE_SIZE_MAX - MLX5_WQE_SIZE; |
| + } |
| priv->cqe_comp = 1; /* Enable compression by default. */ |
| priv->tunnel_en = tunnel_en; |
| err = mlx5_args(priv, pci_dev->device.devargs); |
| @@ -613,6 +635,9 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) |
| "with TSO. MPS disabled"); |
| priv->mps = 0; |
| } |
| + INFO("%sMPS is %s", |
| + priv->mps == MLX5_MPW_ENHANCED ? "Enhanced " : "", |
| + priv->mps != MLX5_MPW_DISABLED ? "enabled" : "disabled"); |
| /* Allocate and register default RSS hash keys. */ |
| priv->rss_conf = rte_calloc(__func__, hash_rxq_init_n, |
| sizeof((*priv->rss_conf)[0]), 0); |
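| |
| (Illustrative sketch, not part of the patch: how the ``txq_mpw_en`` |
| devarg combines with the HW-detected mode in mlx5_args_check() above; |
| all names other than the enum values are hypothetical.) |
| |
|     /* The devarg can only disable MPW; it never upgrades the mode |
|      * beyond what the HW supports. */ |
|     static enum mlx5_mpw_mode |
|     effective_mpw_mode(enum mlx5_mpw_mode hw_mode, int txq_mpw_en) |
|     { |
|             return txq_mpw_en ? hw_mode : MLX5_MPW_DISABLED; |
|     } |
| |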
| diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h |
| index 870e01f..d26d465 100644 |
| --- a/drivers/net/mlx5/mlx5.h |
| +++ b/drivers/net/mlx5/mlx5.h |
| @@ -123,7 +123,8 @@ struct priv { |
| unsigned int hw_fcs_strip:1; /* FCS stripping is supported. */ |
| unsigned int hw_padding:1; /* End alignment padding is supported. */ |
| unsigned int sriov:1; /* This is a VF or PF with VF devices. */ |
| - unsigned int mps:1; /* Whether multi-packet send is supported. */ |
| + unsigned int mps:2; /* Multi-packet send mode (0: disabled). */ |
| + unsigned int mpw_hdr_dseg:1; /* Enable DSEGs in the title WQEBB. */ |
| unsigned int cqe_comp:1; /* Whether CQE compression is enabled. */ |
| unsigned int pending_alarm:1; /* An alarm is pending. */ |
| unsigned int tso:1; /* Whether TSO is supported. */ |
| @@ -132,6 +133,7 @@ struct priv { |
| unsigned int max_tso_payload_sz; /* Maximum TCP payload for TSO. */ |
| unsigned int txq_inline; /* Maximum packet size for inlining. */ |
| unsigned int txqs_inline; /* Queue number threshold for inlining. */ |
| + unsigned int inline_max_packet_sz; /* Max packet size for inlining. */ |
| /* RX/TX queues. */ |
| unsigned int rxqs_n; /* RX queues array size. */ |
| unsigned int txqs_n; /* TX queues array size. */ |
| diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h |
| index eecb908..201bb33 100644 |
| --- a/drivers/net/mlx5/mlx5_defs.h |
| +++ b/drivers/net/mlx5/mlx5_defs.h |
| @@ -55,6 +55,13 @@ |
| #define MLX5_TX_COMP_THRESH 32 |
| |
| /* |
| + * Request TX completion every time the total number of WQEBBs used for inlining |
| + * packets exceeds the size of the WQ divided by this divisor. Preferably |
| + * a power of two for performance. |
| + */ |
| +#define MLX5_TX_COMP_THRESH_INLINE_DIV (1 << 3) |
| + |
| +/* |
| * Maximum number of cached Memory Pools (MPs) per TX queue. Each RTE MP |
| * from which buffers are to be transmitted will have to be mapped by this |
| * driver to their own Memory Region (MR). This is a slow operation. |
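| |
| (Worked example, assuming a 512-WQEBB work queue: a completion is |
| requested once 512 / MLX5_TX_COMP_THRESH_INLINE_DIV = 512 / 8 = 64 |
| WQEBBs have been consumed for inlining since the last request; see the |
| ``txq->mpw_comp`` check in mlx5_tx_burst_empw() below.) |
| |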
| diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c |
| index 5deb6e8..dd5fe5c 100644 |
| --- a/drivers/net/mlx5/mlx5_ethdev.c |
| +++ b/drivers/net/mlx5/mlx5_ethdev.c |
| @@ -1590,7 +1590,11 @@ priv_select_tx_function(struct priv *priv) |
| { |
| priv->dev->tx_pkt_burst = mlx5_tx_burst; |
| /* Select appropriate TX function. */ |
| - if (priv->mps && priv->txq_inline) { |
| + if (priv->mps == MLX5_MPW_ENHANCED) { |
| + priv->dev->tx_pkt_burst = |
| + mlx5_tx_burst_empw; |
| + DEBUG("selected Enhanced MPW TX function"); |
| + } else if (priv->mps && priv->txq_inline) { |
| priv->dev->tx_pkt_burst = mlx5_tx_burst_mpw_inline; |
| DEBUG("selected MPW inline TX function"); |
| } else if (priv->mps) { |
| diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h |
| index 0a77f5b..155bdba 100644 |
| --- a/drivers/net/mlx5/mlx5_prm.h |
| +++ b/drivers/net/mlx5/mlx5_prm.h |
| @@ -73,6 +73,9 @@ |
| /* WQE size */ |
| #define MLX5_WQE_SIZE (4 * MLX5_WQE_DWORD_SIZE) |
| |
| +/* Max size of a WQE session. */ |
| +#define MLX5_WQE_SIZE_MAX 960U |
| + |
| /* Compute the number of DS. */ |
| #define MLX5_WQE_DS(n) \ |
| (((n) + MLX5_WQE_DWORD_SIZE - 1) / MLX5_WQE_DWORD_SIZE) |
| @@ -80,10 +83,19 @@ |
| /* Room for inline data in multi-packet WQE. */ |
| #define MLX5_MWQE64_INL_DATA 28 |
| |
| +/* Default minimum number of Tx queues for inlining packets. */ |
| +#define MLX5_EMPW_MIN_TXQS 8 |
| + |
| +/* Default max packet length to be inlined. */ |
| +#define MLX5_EMPW_MAX_INLINE_LEN (4U * MLX5_WQE_SIZE) |
| + |
| #ifndef HAVE_VERBS_MLX5_OPCODE_TSO |
| #define MLX5_OPCODE_TSO MLX5_OPCODE_LSO_MPW /* Compat with OFED 3.3. */ |
| #endif |
| |
| +#define MLX5_OPC_MOD_ENHANCED_MPSW 0 |
| +#define MLX5_OPCODE_ENHANCED_MPSW 0x29 |
| + |
| /* CQE value to inform that VLAN is stripped. */ |
| #define MLX5_CQE_VLAN_STRIPPED (1u << 0) |
| |
| @@ -176,10 +188,18 @@ struct mlx5_wqe64 { |
| uint8_t raw[32]; |
| } __rte_aligned(MLX5_WQE_SIZE); |
| |
| +/* MPW mode. */ |
| +enum mlx5_mpw_mode { |
| + MLX5_MPW_DISABLED, |
| + MLX5_MPW, |
| + MLX5_MPW_ENHANCED, /* Enhanced Multi-Packet Send WQE, a.k.a. MPWv2. */ |
| +}; |
| + |
| /* MPW session status. */ |
| enum mlx5_mpw_state { |
| MLX5_MPW_STATE_OPENED, |
| MLX5_MPW_INL_STATE_OPENED, |
| + MLX5_MPW_ENHANCED_STATE_OPENED, |
| MLX5_MPW_STATE_CLOSED, |
| }; |
| |
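| (Worked example of the constants above: MLX5_WQE_SIZE is 4 * 16 = 64 |
| bytes, i.e. one WQEBB, so MLX5_WQE_SIZE_MAX (960 bytes) caps an Enhanced |
| MPW session at 15 WQEBBs. MLX5_EMPW_MAX_INLINE_LEN is 4 * 64 = 256 |
| bytes, matching the documented default of ``txq_max_inline_len``, and |
| MLX5_WQE_DS() counts 16-byte units, e.g. MLX5_WQE_DS(100) = 7.) |
| |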
| diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c |
| index 9fc433e..a1dd84a 100644 |
| --- a/drivers/net/mlx5/mlx5_rxtx.c |
| +++ b/drivers/net/mlx5/mlx5_rxtx.c |
| @@ -195,6 +195,62 @@ tx_mlx5_wqe(struct txq *txq, uint16_t ci) |
| } |
| |
| /** |
| + * Return the size of the tailroom of the WQ. |
| + * |
| + * @param txq |
| + * Pointer to TX queue structure. |
| + * @param addr |
| + * Pointer to tail of WQ. |
| + * |
| + * @return |
| + * Size of tailroom. |
| + */ |
| +static inline size_t |
| +tx_mlx5_wq_tailroom(struct txq *txq, void *addr) |
| +{ |
| + size_t tailroom; |
| + tailroom = (uintptr_t)(txq->wqes) + |
| + (1 << txq->wqe_n) * MLX5_WQE_SIZE - |
| + (uintptr_t)addr; |
| + return tailroom; |
| +} |
| + |
| +/** |
| + * Copy data to tailroom of circular queue. |
| + * |
| + * @param dst |
| + * Pointer to destination. |
| + * @param src |
| + * Pointer to source. |
| + * @param n |
| + * Number of bytes to copy. |
| + * @param base |
| + * Pointer to head of queue. |
| + * @param tailroom |
| + * Size of tailroom from dst. |
| + * |
| + * @return |
| + * Pointer after copied data. |
| + */ |
| +static inline void * |
| +mlx5_copy_to_wq(void *dst, const void *src, size_t n, |
| + void *base, size_t tailroom) |
| +{ |
| + void *ret; |
| + |
| + if (n > tailroom) { |
| + rte_memcpy(dst, src, tailroom); |
| + rte_memcpy(base, (void *)((uintptr_t)src + tailroom), |
| + n - tailroom); |
| + ret = (uint8_t *)base + n - tailroom; |
| + } else { |
| + rte_memcpy(dst, src, n); |
| + ret = (n == tailroom) ? base : (uint8_t *)dst + n; |
| + } |
| + return ret; |
| +} |
| + |
| +/** |
| * Manage TX completions. |
| * |
| * When sending a burst, mlx5_tx_burst() posts several WRs. |
| @@ -1269,6 +1325,360 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts, |
| } |
| |
| /** |
| + * Open an Enhanced MPW session. |
| + * |
| + * @param txq |
| + * Pointer to TX queue structure. |
| + * @param mpw |
| + * Pointer to MPW session structure. |
| + * @param padding |
| + * Nonzero to pad the title WQEBB with a zero-length inline header. |
| + */ |
| +static inline void |
| +mlx5_empw_new(struct txq *txq, struct mlx5_mpw *mpw, int padding) |
| +{ |
| + uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1); |
| + |
| + mpw->state = MLX5_MPW_ENHANCED_STATE_OPENED; |
| + mpw->pkts_n = 0; |
| + mpw->total_len = sizeof(struct mlx5_wqe); |
| + mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx); |
| + mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_ENHANCED_MPSW << 24) | |
| + (txq->wqe_ci << 8) | |
| + MLX5_OPCODE_ENHANCED_MPSW); |
| + mpw->wqe->ctrl[2] = 0; |
| + mpw->wqe->ctrl[3] = 0; |
| + memset((void *)(uintptr_t)&mpw->wqe->eseg, 0, MLX5_WQE_DWORD_SIZE); |
| + if (unlikely(padding)) { |
| + uintptr_t addr = (uintptr_t)(mpw->wqe + 1); |
| + |
| + /* Pad the first 2 DWORDs with zero-length inline header. */ |
| + *(volatile uint32_t *)addr = htonl(MLX5_INLINE_SEG); |
| + *(volatile uint32_t *)(addr + MLX5_WQE_DWORD_SIZE) = |
| + htonl(MLX5_INLINE_SEG); |
| + mpw->total_len += 2 * MLX5_WQE_DWORD_SIZE; |
| + /* Start from the next WQEBB. */ |
| + mpw->data.raw = (volatile void *)(tx_mlx5_wqe(txq, idx + 1)); |
| + } else { |
| + mpw->data.raw = (volatile void *)(mpw->wqe + 1); |
| + } |
| +} |
| + |
| +/** |
| + * Close an Enhanced MPW session. |
| + * |
| + * @param txq |
| + * Pointer to TX queue structure. |
| + * @param mpw |
| + * Pointer to MPW session structure. |
| + * |
| + * @return |
| + * Number of consumed WQEs. |
| + */ |
| +static inline uint16_t |
| +mlx5_empw_close(struct txq *txq, struct mlx5_mpw *mpw) |
| +{ |
| + uint16_t ret; |
| + |
| + /* Store size in multiples of 16 bytes. Control and Ethernet segments |
| + * count as 2. |
| + */ |
| + mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | MLX5_WQE_DS(mpw->total_len)); |
| + mpw->state = MLX5_MPW_STATE_CLOSED; |
| + ret = (mpw->total_len + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE; |
| + txq->wqe_ci += ret; |
| + return ret; |
| +} |
| + |
| +/** |
| + * DPDK callback for TX with Enhanced MPW support. |
| + * |
| + * @param dpdk_txq |
| + * Generic pointer to TX queue structure. |
| + * @param[in] pkts |
| + * Packets to transmit. |
| + * @param pkts_n |
| + * Number of packets in array. |
| + * |
| + * @return |
| + * Number of packets successfully transmitted (<= pkts_n). |
| + */ |
| +uint16_t |
| +mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) |
| +{ |
| + struct txq *txq = (struct txq *)dpdk_txq; |
| + uint16_t elts_head = txq->elts_head; |
| + const unsigned int elts_n = 1 << txq->elts_n; |
| + unsigned int i = 0; |
| + unsigned int j = 0; |
| + unsigned int max_elts; |
| + uint16_t max_wqe; |
| + unsigned int max_inline = txq->max_inline * RTE_CACHE_LINE_SIZE; |
| + unsigned int mpw_room = 0; |
| + unsigned int inl_pad = 0; |
| + uint32_t inl_hdr; |
| + struct mlx5_mpw mpw = { |
| + .state = MLX5_MPW_STATE_CLOSED, |
| + }; |
| + |
| + if (unlikely(!pkts_n)) |
| + return 0; |
| + /* Start processing. */ |
| + txq_complete(txq); |
| + max_elts = (elts_n - (elts_head - txq->elts_tail)); |
| + if (max_elts > elts_n) |
| + max_elts -= elts_n; |
| + /* A CQE slot must always be available. */ |
| + assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci)); |
| + max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi); |
| + if (unlikely(!max_wqe)) |
| + return 0; |
| + do { |
| + struct rte_mbuf *buf = *(pkts++); |
| + unsigned int elts_head_next; |
| + uintptr_t addr; |
| + uint64_t naddr; |
| + unsigned int n; |
| + unsigned int do_inline = 0; /* Whether inline is possible. */ |
| + uint32_t length; |
| + unsigned int segs_n = buf->nb_segs; |
| + uint32_t cs_flags = 0; |
| + |
| + /* |
| + * Make sure there is enough room to store this packet and |
| + * that one ring entry remains unused. |
| + */ |
| + assert(segs_n); |
| + if (max_elts - j < segs_n + 1) |
| + break; |
| + /* Do not bother with large packets MPW cannot handle. */ |
| + if (segs_n > MLX5_MPW_DSEG_MAX) |
| + break; |
| + /* Should we enable HW CKSUM offload. */ |
| + if (buf->ol_flags & |
| + (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) |
| + cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM; |
| + /* Retrieve packet information. */ |
| + length = PKT_LEN(buf); |
| + /* Start new session if: |
| + * - multi-segment packet |
| + * - no space left even for a dseg |
| + * - next packet can be inlined with a new WQE |
| + * - cs_flag differs |
| + * It can't be MLX5_MPW_STATE_OPENED, as a legacy MPW session |
| + * always holds a single multi-segment packet. |
| + */ |
| + if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED) { |
| + if ((segs_n != 1) || |
| + (inl_pad + sizeof(struct mlx5_wqe_data_seg) > |
| + mpw_room) || |
| + (length <= txq->inline_max_packet_sz && |
| + inl_pad + sizeof(inl_hdr) + length > |
| + mpw_room) || |
| + (mpw.wqe->eseg.cs_flags != cs_flags)) |
| + max_wqe -= mlx5_empw_close(txq, &mpw); |
| + } |
| + if (unlikely(mpw.state == MLX5_MPW_STATE_CLOSED)) { |
| + if (unlikely(segs_n != 1)) { |
| + /* Fall back to legacy MPW. |
| + * An MPW session consumes 2 WQEs at most to |
| + * include MLX5_MPW_DSEG_MAX pointers. |
| + */ |
| + if (unlikely(max_wqe < 2)) |
| + break; |
| + mlx5_mpw_new(txq, &mpw, length); |
| + } else { |
| + /* In Enhanced MPW, inline as much as the budget |
| + * allows. The remaining space is filled with |
| + * dsegs. If the title WQEBB isn't padded, it |
| + * will hold 2 dsegs. |
| + */ |
| + mpw_room = RTE_MIN(MLX5_WQE_SIZE_MAX, |
| + (max_inline ? max_inline : |
| + pkts_n * MLX5_WQE_DWORD_SIZE) + |
| + MLX5_WQE_SIZE); |
| + if (unlikely(max_wqe * MLX5_WQE_SIZE < |
| + mpw_room)) |
| + break; |
| + /* Don't pad the title WQEBB, to avoid wasting WQ space. */ |
| + mlx5_empw_new(txq, &mpw, 0); |
| + mpw_room -= mpw.total_len; |
| + inl_pad = 0; |
| + do_inline = |
| + length <= txq->inline_max_packet_sz && |
| + sizeof(inl_hdr) + length <= mpw_room && |
| + !txq->mpw_hdr_dseg; |
| + } |
| + mpw.wqe->eseg.cs_flags = cs_flags; |
| + } else { |
| + /* Evaluate whether the next packet can be inlined. |
| + * Inlining is possible when: |
| + * - length is less than the configured value |
| + * - length fits in the remaining space |
| + * - not required to fill the title WQEBB with dsegs |
| + */ |
| + do_inline = |
| + length <= txq->inline_max_packet_sz && |
| + inl_pad + sizeof(inl_hdr) + length <= |
| + mpw_room && |
| + (!txq->mpw_hdr_dseg || |
| + mpw.total_len >= MLX5_WQE_SIZE); |
| + } |
| + /* Multi-segment packets must be alone in their MPW. */ |
| + assert((segs_n == 1) || (mpw.pkts_n == 0)); |
| + if (unlikely(mpw.state == MLX5_MPW_STATE_OPENED)) { |
| +#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) |
| + length = 0; |
| +#endif |
| + do { |
| + volatile struct mlx5_wqe_data_seg *dseg; |
| + |
| + elts_head_next = |
| + (elts_head + 1) & (elts_n - 1); |
| + assert(buf); |
| + (*txq->elts)[elts_head] = buf; |
| + dseg = mpw.data.dseg[mpw.pkts_n]; |
| + addr = rte_pktmbuf_mtod(buf, uintptr_t); |
| + *dseg = (struct mlx5_wqe_data_seg){ |
| + .byte_count = htonl(DATA_LEN(buf)), |
| + .lkey = txq_mp2mr(txq, txq_mb2mp(buf)), |
| + .addr = htonll(addr), |
| + }; |
| + elts_head = elts_head_next; |
| +#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) |
| + length += DATA_LEN(buf); |
| +#endif |
| + buf = buf->next; |
| + ++j; |
| + ++mpw.pkts_n; |
| + } while (--segs_n); |
| + /* A multi-segmented packet takes one MPW session. |
| + * TODO: Pack more multi-segmented packets if possible. |
| + */ |
| + mlx5_mpw_close(txq, &mpw); |
| + if (mpw.pkts_n < 3) |
| + max_wqe--; |
| + else |
| + max_wqe -= 2; |
| + } else if (do_inline) { |
| + /* Inline packet into WQE. */ |
| + unsigned int max; |
| + |
| + assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED); |
| + assert(length == DATA_LEN(buf)); |
| + inl_hdr = htonl(length | MLX5_INLINE_SEG); |
| + addr = rte_pktmbuf_mtod(buf, uintptr_t); |
| + mpw.data.raw = (volatile void *) |
| + ((uintptr_t)mpw.data.raw + inl_pad); |
| + max = tx_mlx5_wq_tailroom(txq, |
| + (void *)(uintptr_t)mpw.data.raw); |
| + /* Copy inline header. */ |
| + mpw.data.raw = (volatile void *) |
| + mlx5_copy_to_wq( |
| + (void *)(uintptr_t)mpw.data.raw, |
| + &inl_hdr, |
| + sizeof(inl_hdr), |
| + (void *)(uintptr_t)txq->wqes, |
| + max); |
| + max = tx_mlx5_wq_tailroom(txq, |
| + (void *)(uintptr_t)mpw.data.raw); |
| + /* Copy packet data. */ |
| + mpw.data.raw = (volatile void *) |
| + mlx5_copy_to_wq( |
| + (void *)(uintptr_t)mpw.data.raw, |
| + (void *)addr, |
| + length, |
| + (void *)(uintptr_t)txq->wqes, |
| + max); |
| + ++mpw.pkts_n; |
| + mpw.total_len += (inl_pad + sizeof(inl_hdr) + length); |
| + /* No need to get completion as the entire packet is |
| + * copied to the WQ. Free the buf right away. |
| + */ |
| + elts_head_next = elts_head; |
| + rte_pktmbuf_free_seg(buf); |
| + mpw_room -= (inl_pad + sizeof(inl_hdr) + length); |
| + /* Add pad in the next packet if any. */ |
| + inl_pad = (((uintptr_t)mpw.data.raw + |
| + (MLX5_WQE_DWORD_SIZE - 1)) & |
| + ~(MLX5_WQE_DWORD_SIZE - 1)) - |
| + (uintptr_t)mpw.data.raw; |
| + } else { |
| + /* No inline. Write a dseg with the packet pointer. */ |
| + volatile rte_v128u32_t *dseg; |
| + |
| + assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED); |
| + assert((inl_pad + sizeof(*dseg)) <= mpw_room); |
| + assert(length == DATA_LEN(buf)); |
| + if (!tx_mlx5_wq_tailroom(txq, |
| + (void *)((uintptr_t)mpw.data.raw |
| + + inl_pad))) |
| + dseg = (volatile void *)txq->wqes; |
| + else |
| + dseg = (volatile void *) |
| + ((uintptr_t)mpw.data.raw + |
| + inl_pad); |
| + elts_head_next = (elts_head + 1) & (elts_n - 1); |
| + (*txq->elts)[elts_head] = buf; |
| + addr = rte_pktmbuf_mtod(buf, uintptr_t); |
| + for (n = 0; n * RTE_CACHE_LINE_SIZE < length; n++) |
| + rte_prefetch2((void *)(addr + |
| + n * RTE_CACHE_LINE_SIZE)); |
| + naddr = htonll(addr); |
| + *dseg = (rte_v128u32_t) { |
| + htonl(length), |
| + txq_mp2mr(txq, txq_mb2mp(buf)), |
| + naddr, |
| + naddr >> 32, |
| + }; |
| + mpw.data.raw = (volatile void *)(dseg + 1); |
| + mpw.total_len += (inl_pad + sizeof(*dseg)); |
| + ++j; |
| + ++mpw.pkts_n; |
| + mpw_room -= (inl_pad + sizeof(*dseg)); |
| + inl_pad = 0; |
| + } |
| + elts_head = elts_head_next; |
| +#ifdef MLX5_PMD_SOFT_COUNTERS |
| + /* Increment sent bytes counter. */ |
| + txq->stats.obytes += length; |
| +#endif |
| + ++i; |
| + } while (i < pkts_n); |
| + /* Take a shortcut if nothing must be sent. */ |
| + if (unlikely(i == 0)) |
| + return 0; |
| + /* Check whether completion threshold has been reached. */ |
| + if (txq->elts_comp + j >= MLX5_TX_COMP_THRESH || |
| + (uint16_t)(txq->wqe_ci - txq->mpw_comp) >= |
| + (1 << txq->wqe_n) / MLX5_TX_COMP_THRESH_INLINE_DIV) { |
| + volatile struct mlx5_wqe *wqe = mpw.wqe; |
| + |
| + /* Request completion on last WQE. */ |
| + wqe->ctrl[2] = htonl(8); |
| + /* Save elts_head in unused "immediate" field of WQE. */ |
| + wqe->ctrl[3] = elts_head; |
| + txq->elts_comp = 0; |
| + txq->mpw_comp = txq->wqe_ci; |
| + txq->cq_pi++; |
| + } else { |
| + txq->elts_comp += j; |
| + } |
| +#ifdef MLX5_PMD_SOFT_COUNTERS |
| + /* Increment sent packets counter. */ |
| + txq->stats.opackets += i; |
| +#endif |
| + if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED) |
| + mlx5_empw_close(txq, &mpw); |
| + else if (mpw.state == MLX5_MPW_STATE_OPENED) |
| + mlx5_mpw_close(txq, &mpw); |
| + /* Ring QP doorbell. */ |
| + mlx5_tx_dbrec(txq, mpw.wqe); |
| + txq->elts_head = elts_head; |
| + return i; |
| +} |
| + |
| +/** |
| * Translate RX completion flags to packet type. |
| * |
| * @param[in] cqe |
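| |
| (Standalone sketch, not part of the patch, exercising the wrap-around |
| copy that mlx5_copy_to_wq() implements; rte_memcpy is replaced by plain |
| memcpy and all names are illustrative.) |
| |
|     #include <assert.h> |
|     #include <stdint.h> |
|     #include <string.h> |
| |
|     /* Copy n bytes starting at dst, wrapping to base once tailroom |
|      * (bytes left before the end of the ring) is exhausted; return |
|      * the write position just past the copied data. */ |
|     static void * |
|     copy_to_ring(void *dst, const void *src, size_t n, |
|                  void *base, size_t tailroom) |
|     { |
|             if (n > tailroom) { |
|                     memcpy(dst, src, tailroom); |
|                     memcpy(base, (const uint8_t *)src + tailroom, |
|                            n - tailroom); |
|                     return (uint8_t *)base + (n - tailroom); |
|             } |
|             memcpy(dst, src, n); |
|             return (n == tailroom) ? base : (uint8_t *)dst + n; |
|     } |
| |
|     int |
|     main(void) |
|     { |
|             uint8_t ring[16] = { 0 }; |
|             const uint8_t data[10] = "ABCDEFGHI"; |
| |
|             /* 10 bytes at offset 12 of a 16-byte ring: 4 bytes fit at |
|              * the tail, the remaining 6 wrap to the head. */ |
|             void *next = copy_to_ring(&ring[12], data, sizeof(data), |
|                                       ring, 4); |
|             assert(next == &ring[6]); |
|             assert(ring[12] == 'A' && ring[0] == 'E'); |
|             return 0; |
|     } |
| |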
| diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h |
| index 0db810c..4a4bd84 100644 |
| --- a/drivers/net/mlx5/mlx5_rxtx.h |
| +++ b/drivers/net/mlx5/mlx5_rxtx.h |
| @@ -248,17 +248,21 @@ struct txq { |
| uint16_t elts_head; /* Current index in (*elts)[]. */ |
| uint16_t elts_tail; /* First element awaiting completion. */ |
| uint16_t elts_comp; /* Counter since last completion request. */ |
| + uint16_t mpw_comp; /* WQ index since last completion request. */ |
| uint16_t cq_ci; /* Consumer index for completion queue. */ |
| + uint16_t cq_pi; /* Producer index for completion queue. */ |
| uint16_t wqe_ci; /* Consumer index for work queue. */ |
| uint16_t wqe_pi; /* Producer index for work queue. */ |
| uint16_t elts_n:4; /* (*elts)[] length (in log2). */ |
| uint16_t cqe_n:4; /* Number of CQ elements (in log2). */ |
| uint16_t wqe_n:4; /* Number of of WQ elements (in log2). */ |
| - uint16_t max_inline; /* Multiple of RTE_CACHE_LINE_SIZE to inline. */ |
| uint16_t inline_en:1; /* When set inline is enabled. */ |
| uint16_t tso_en:1; /* When set hardware TSO is enabled. */ |
| uint16_t tunnel_en:1; |
| /* When set TX offload for tunneled packets are supported. */ |
| + uint16_t mpw_hdr_dseg:1; /* Enable DSEGs in the title WQEBB. */ |
| + uint16_t max_inline; /* Multiple of RTE_CACHE_LINE_SIZE to inline. */ |
| + uint16_t inline_max_packet_sz; /* Max packet size for inlining. */ |
| uint32_t qp_num_8s; /* QP number shifted by 8. */ |
| volatile struct mlx5_cqe (*cqes)[]; /* Completion queue. */ |
| volatile void *wqes; /* Work queue (use volatile to write into). */ |
| @@ -329,6 +333,7 @@ uint16_t mlx5_tx_burst_secondary_setup(void *, struct rte_mbuf **, uint16_t); |
| uint16_t mlx5_tx_burst(void *, struct rte_mbuf **, uint16_t); |
| uint16_t mlx5_tx_burst_mpw(void *, struct rte_mbuf **, uint16_t); |
| uint16_t mlx5_tx_burst_mpw_inline(void *, struct rte_mbuf **, uint16_t); |
| +uint16_t mlx5_tx_burst_empw(void *, struct rte_mbuf **, uint16_t); |
| uint16_t mlx5_rx_burst(void *, struct rte_mbuf **, uint16_t); |
| uint16_t removed_tx_burst(void *, struct rte_mbuf **, uint16_t); |
| uint16_t removed_rx_burst(void *, struct rte_mbuf **, uint16_t); |
| diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c |
| index 9d0c00f..bbfce75 100644 |
| --- a/drivers/net/mlx5/mlx5_txq.c |
| +++ b/drivers/net/mlx5/mlx5_txq.c |
| @@ -266,6 +266,7 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, |
| struct ibv_exp_cq_attr cq_attr; |
| } attr; |
| enum ibv_exp_query_intf_status status; |
| + unsigned int cqe_n; |
| int ret = 0; |
| |
| if (mlx5_getenv_int("MLX5_ENABLE_CQE_COMPRESSION")) { |
| @@ -276,6 +277,8 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, |
| (void)conf; /* Thresholds configuration (ignored). */ |
| assert(desc > MLX5_TX_COMP_THRESH); |
| tmpl.txq.elts_n = log2above(desc); |
| + if (priv->mps == MLX5_MPW_ENHANCED) |
| + tmpl.txq.mpw_hdr_dseg = priv->mpw_hdr_dseg; |
| /* MRs will be registered in mp2mr[] later. */ |
| attr.rd = (struct ibv_exp_res_domain_init_attr){ |
| .comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL | |
| @@ -294,9 +297,12 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, |
| .comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN, |
| .res_domain = tmpl.rd, |
| }; |
| + cqe_n = ((desc / MLX5_TX_COMP_THRESH) - 1) ? |
| + ((desc / MLX5_TX_COMP_THRESH) - 1) : 1; |
| + if (priv->mps == MLX5_MPW_ENHANCED) |
| + cqe_n += MLX5_TX_COMP_THRESH_INLINE_DIV; |
| tmpl.cq = ibv_exp_create_cq(priv->ctx, |
| - (((desc / MLX5_TX_COMP_THRESH) - 1) ? |
| - ((desc / MLX5_TX_COMP_THRESH) - 1) : 1), |
| + cqe_n, |
| NULL, NULL, 0, &attr.cq); |
| if (tmpl.cq == NULL) { |
| ret = ENOMEM; |
| @@ -340,9 +346,24 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, |
| tmpl.txq.max_inline = |
| ((priv->txq_inline + (RTE_CACHE_LINE_SIZE - 1)) / |
| RTE_CACHE_LINE_SIZE); |
| - attr.init.cap.max_inline_data = |
| - tmpl.txq.max_inline * RTE_CACHE_LINE_SIZE; |
| tmpl.txq.inline_en = 1; |
| + /* TSO and MPS can't be enabled concurrently. */ |
| + assert(!priv->tso || !priv->mps); |
| + if (priv->mps == MLX5_MPW_ENHANCED) { |
| + tmpl.txq.inline_max_packet_sz = |
| + priv->inline_max_packet_sz; |
| + /* To minimize the size of the data set, avoid |
| + * requesting an overly large WQ. |
| + */ |
| + attr.init.cap.max_inline_data = |
| + ((RTE_MIN(priv->txq_inline, |
| + priv->inline_max_packet_sz) + |
| + (RTE_CACHE_LINE_SIZE - 1)) / |
| + RTE_CACHE_LINE_SIZE) * RTE_CACHE_LINE_SIZE; |
| + } else { |
| + attr.init.cap.max_inline_data = |
| + tmpl.txq.max_inline * RTE_CACHE_LINE_SIZE; |
| + } |
| } |
| if (priv->tso) { |
| uint16_t max_tso_inline = ((MLX5_MAX_TSO_HEADER + |
| -- |
| 2.7.4 |
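| |
| (Worked example of the CQ sizing in mlx5_txq.c above: with desc = 512 |
| and MLX5_TX_COMP_THRESH = 32, cqe_n = 512 / 32 - 1 = 15; with Enhanced |
| MPW, MLX5_TX_COMP_THRESH_INLINE_DIV = 8 extra CQE slots are added for |
| the completions requested on inlining, giving 23.) |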
| |