Add RDMA ibverb driver plugin
RDMA ibverb is a userspace API to efficiently rx/tx packets. This is an
initial, unoptimized driver targeting Mellanox cards.
Next steps should include batching, multiqueue and additional cards.
Change-Id: I0309c7a543f75f2f9317eaf63ca502ac7a093ef9
Signed-off-by: Benoît Ganne <bganne@cisco.com>
diff --git a/build/external/Makefile b/build/external/Makefile
index a1352a6..084d694 100644
--- a/build/external/Makefile
+++ b/build/external/Makefile
@@ -31,11 +31,18 @@
include packages/nasm.mk
include packages/ipsec-mb.mk
include packages/dpdk.mk
+include packages/rdma-core.mk
.PHONY: clean
clean:
@rm -rf $(B) $(I)
+.PHONY: install
+install: dpdk-install rdma-core-install
+
+.PHONY: config
+config: dpdk-config rdma-core-config
+
##############################################################################
# .deb packaging
##############################################################################
@@ -62,11 +69,6 @@
install-deb:
ifneq ($(INSTALLED_VER),$(DEB_VER)-$(PKG_SUFFIX))
- @echo "=========================================================="
- @echo " Out of date vpp-ext-deps package installed."
- @echo " Installed: $(INSTALLED_VER)"
- @echo " Needed: $(DEB_VER)-$(PKG_SUFFIX)"
- @echo "=========================================================="
@make $(DEV_DEB)
@sudo dpkg -i $(DEV_DEB)
else
@@ -78,9 +80,9 @@
check-deb:
ifneq ($(INSTALLED_VER),$(DEB_VER)-$(PKG_SUFFIX))
@echo "=========================================================="
- @echo " Outdated DPDK package detected:"
- @echo " Installed: vpp-ext-deps $(INSTALLED_VER)"
- @echo " Current: vpp-ext-deps $(DEB_VER)-$(PKG_SUFFIX)"
+ @echo " Out of date vpp-ext-deps package installed."
+ @echo " Installed: $(INSTALLED_VER)"
+ @echo " Needed: $(DEB_VER)-$(PKG_SUFFIX)"
@echo ""
@echo " Please upgrade by invoking 'make install-ext-deps'"
@echo " from the top level directory."
@@ -115,16 +117,16 @@
sudo rpm -Uih --force $(DEV_RPM)
else
@echo "=========================================================="
- @echo " Up-to-date DPDK package already installed"
+ @echo " Up-to-date vpp-ext-deps package already installed"
@echo "=========================================================="
endif
check-rpm:
ifneq ($(INSTALLED_RPM_VER),$(RPM_VER)-$(PKG_SUFFIX))
@echo "=========================================================="
- @echo " Outdated DPDK package detected:"
- @echo " Installed: vpp-ext-deps $(INSTALLED_RPM_VER)"
- @echo " Current: vpp-ext-deps $(RPM_VER)-$(PKG_SUFFIX)"
+ @echo " Out of date vpp-ext-deps package installed."
+ @echo " Installed: $(INSTALLED_RPM_VER)"
+ @echo " Needed: $(RPM_VER)-$(PKG_SUFFIX)"
@echo ""
@echo " Please upgrade by invoking 'make install-ext-deps'"
@echo " from the top level directory."
@@ -140,9 +142,9 @@
ebuild-build:
ifeq ($(INSTALLED_VER)$(INSTALLED_RPM_VER),)
@echo "=========================================================="
- @echo "Building DPDK from source. Consider installing development"
- @echo "package by invoking 'make install-ext-deps' from the"
- @echo "top level directory"
+ @echo "Building vpp-ext-deps from source. Consider installing"
+ @echo "development package by invoking 'make install-ext-deps'"
+ @echo "from the top level directory"
@echo "=========================================================="
make config
else
diff --git a/build/external/deb/debian/rules b/build/external/deb/debian/rules
index 6393f82..2b1157e 100755
--- a/build/external/deb/debian/rules
+++ b/build/external/deb/debian/rules
@@ -20,7 +20,6 @@
make $(MAKE_ARGS) clean
override_dh_auto_configure:
- make $(MAKE_ARGS) config
override_dh_install:
make $(MAKE_ARGS) install
diff --git a/build/external/packages.mk b/build/external/packages.mk
index 4056b2f..005c2a9 100644
--- a/build/external/packages.mk
+++ b/build/external/packages.mk
@@ -31,12 +31,12 @@
downloads/$($1_tarball):
mkdir -p downloads
@if [ -e $(DL_CACHE_DIR)/$($1_tarball) ] ; \
- then cp $(DL_CACHE_DIR)/$($1_tarball) downloads/ ; \
+ then cp $(DL_CACHE_DIR)/$($1_tarball) $$@ ; \
else \
echo "Downloading $($1_url)" ; \
- curl -o downloads/$($1_tarball) -LO $($1_url) ; \
+ curl -o $$@ -LO $($1_url) ; \
fi
- @rm -f $(B)/.download.ok
+ @rm -f $(B)/.$1.download.ok
$(B)/.$1.download.ok: downloads/$($1_tarball)
@mkdir -p $(B)
diff --git a/build/external/packages/dpdk.mk b/build/external/packages/dpdk.mk
index 68c2767..ae9d9c5 100644
--- a/build/external/packages/dpdk.mk
+++ b/build/external/packages/dpdk.mk
@@ -167,9 +167,7 @@
fi
endef
-all: build
-
-$(B)/custom-config: $(B)/.patch.ok Makefile
+$(B)/custom-config: $(B)/.dpdk-patch.ok Makefile
@echo --- generating custom config from $(DPDK_SOURCE)/config/defconfig_$(DPDK_TARGET) ---
@cpp -undef -ffreestanding -x assembler-with-cpp $(DPDK_SOURCE)/config/defconfig_$(DPDK_TARGET) $@
$(call set,RTE_MACHINE,$(DPDK_MACHINE))
@@ -230,18 +228,19 @@
$(call set,RTE_LIBRTE_DPAA_PMD,n)
$(call set,RTE_LIBRTE_PMD_DPAA_SEC,n)
$(call set,RTE_LIBRTE_PMD_DPAA_EVENTDEV,n)
- @rm -f .config.ok
+ @rm -f .dpdk-config.ok
-$(CURDIR)/$(DPDK_TARBALL):
+DPDK_DOWNLOADS = $(CURDIR)/downloads/$(DPDK_TARBALL)
+
+$(DPDK_DOWNLOADS):
+ mkdir -p downloads
@if [ -e $(DPDK_DOWNLOAD_DIR)/$(DPDK_TARBALL) ] ; \
- then cp $(DPDK_DOWNLOAD_DIR)/$(DPDK_TARBALL) $(CURDIR) ; \
- else curl -o $(CURDIR)/$(DPDK_TARBALL) -LO $(DPDK_TAR_URL) ; \
+ then cp $(DPDK_DOWNLOAD_DIR)/$(DPDK_TARBALL) $@ ; \
+ else curl -o $@ -LO $(DPDK_TAR_URL) ; \
fi
- @rm -f $(B)/.download.ok
+ @rm -f $(B)/.dpdk-download.ok
-DPDK_DOWNLOADS = $(CURDIR)/$(DPDK_TARBALL)
-
-$(B)/.download.ok: $(DPDK_DOWNLOADS)
+$(B)/.dpdk-download.ok: $(DPDK_DOWNLOADS)
@mkdir -p $(B)
@openssl md5 $< | cut -f 2 -d " " - > $(B)/$(DPDK_TARBALL).md5sum
@([ "$$(<$(B)/$(DPDK_TARBALL).md5sum)" = "$(DPDK_$(DPDK_VERSION)_TARBALL_MD5_CKSUM)" ] || \
@@ -249,18 +248,18 @@
rm $(B)/$(DPDK_TARBALL).md5sum && false ))
@touch $@
-.PHONY: download
-download: $(B)/.download.ok
+.PHONY: dpdk-download
+dpdk-download: $(B)/.dpdk-download.ok
-$(B)/.extract.ok: $(B)/.download.ok
+$(B)/.dpdk-extract.ok: $(B)/.dpdk-download.ok
@echo --- extracting $(DPDK_TARBALL) ---
- @tar --directory $(B) --extract --file $(CURDIR)/$(DPDK_TARBALL)
+ @tar --directory $(B) --extract --file $(DPDK_DOWNLOADS)
@touch $@
-.PHONY: extract
-extract: $(B)/.extract.ok
+.PHONY: dpdk-extract
+dpdk-extract: $(B)/.dpdk-extract.ok
-$(B)/.patch.ok: $(B)/.extract.ok
+$(B)/.dpdk-patch.ok: $(B)/.dpdk-extract.ok
ifneq ($(wildcard $(CURDIR)/patches/dpdk_$(DPDK_VERSION)/*.patch),)
@echo --- patching ---
@for f in $(CURDIR)/patches/dpdk_$(DPDK_VERSION)/*.patch ; do \
@@ -270,26 +269,23 @@
endif
@touch $@
-.PHONY: patch
-patch: $(B)/.patch.ok
+.PHONY: dpdk-patch
+dpdk-patch: $(B)/.dpdk-patch.ok
-$(B)/.config.ok: $(B)/.patch.ok $(B)/custom-config
+$(B)/.dpdk-config.ok: $(B)/.dpdk-patch.ok $(B)/custom-config
@make $(DPDK_MAKE_ARGS) config
@touch $@
-.PHONY: config
-config: $(B)/.config.ok
+.PHONY: dpdk-config
+dpdk-config: $(B)/.dpdk-config.ok
-.PHONY: build-dpdk
-build-dpdk: $(DPDK_BUILD_DEPS)
- @if [ ! -e $(B)/.config.ok ] ; then echo 'Please run "make config" first' && false ; fi
+# Depend on the config stamp file rather than on the phony 'dpdk-config'
+# alias: a phony prerequisite is always considered out of date, which would
+# force this stamp (and the whole DPDK build) to be redone on every run.
+$(B)/.dpdk-build.ok: $(B)/.dpdk-config.ok $(DPDK_BUILD_DEPS)
+	@if [ ! -e $(B)/.dpdk-config.ok ] ; then echo 'Please run "make config" first' && false ; fi
@make $(DPDK_MAKE_ARGS) install
-
-$(B)/.build.ok: build-dpdk
@touch $@
-.PHONY: build
-build: $(B)/.build.ok
+.PHONY: dpdk-build
+dpdk-build: $(B)/.dpdk-build.ok
-.PHONY: install
-install: $(B)/.build.ok
+.PHONY: dpdk-install
+dpdk-install: $(B)/.dpdk-build.ok
diff --git a/build/external/packages/rdma-core.mk b/build/external/packages/rdma-core.mk
new file mode 100644
index 0000000..0e8c878
--- /dev/null
+++ b/build/external/packages/rdma-core.mk
@@ -0,0 +1,46 @@
+# Copyright (c) 2018 Cisco and/or its affiliates.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# rdma-core release to fetch; the expected tarball checksum is selected
+# from the per-version md5sum table below.
+rdma-core_version             := 23
+rdma-core_tarball             := rdma-core-$(rdma-core_version).tar.gz
+rdma-core_tarball_md5sum_22.1 := dde4d30e3db20893408ae51041117034
+rdma-core_tarball_md5sum_23   := c78575735c4a71609c1a214ea16cd8dc
+rdma-core_tarball_md5sum      := $(rdma-core_tarball_md5sum_$(rdma-core_version))
+rdma-core_tarball_strip_dirs  := 1
+# Download over HTTPS so the source tarball is not fetched in clear text.
+rdma-core_url                 := https://github.com/linux-rdma/rdma-core/releases/download/v$(rdma-core_version)/$(rdma-core_tarball)
+
+# Headers and static archives, relative to the rdma-core build directory,
+# that the install step copies out.
+RDMA_FILES := $(addprefix include/infiniband/,verbs.h verbs_api.h ib_user_ioctl_verbs.h) \
+	      include/rdma/ib_user_verbs.h \
+	      $(addprefix lib/statics/,libibverbs.a libmlx5.a)
+
+# Configure step: run cmake (Ninja generator) from the build directory with
+# static libraries enabled and neighbour resolution, pyverbs and valgrind
+# support turned off. cmake's stdout is redirected to the config log.
+# NOTE(review): the rdma-core_*_dir / rdma-core_*_log variables are not
+# defined in this file - presumably provided by the 'package' template.
+define rdma-core_config_cmds
+	cd $(rdma-core_build_dir) && \
+	  cmake -G Ninja $(rdma-core_src_dir) \
+	    -DENABLE_STATIC=1 -DENABLE_RESOLVE_NEIGH=0 -DNO_PYVERBS=1 -DENABLE_VALGRIND=0 \
+	    -DCMAKE_BUILD_TYPE=RelWithDebInfo \
+	    -DCMAKE_C_FLAGS=-fPIC > $(rdma-core_config_log)
+endef
+
+# Build step: only the libibverbs.a and libmlx5.a targets are built.
+define rdma-core_build_cmds
+	cmake --build $(rdma-core_build_dir) -- libibverbs.a libmlx5.a > $(rdma-core_build_log)
+endef
+
+# Install step: copy $(RDMA_FILES) from the build directory into the install
+# directory through a tar pipe; -h follows symlinks and --xform rewrites
+# '/statics/' to '/' so the archives land directly under lib/.
+define rdma-core_install_cmds
+	mkdir -p $(rdma-core_install_dir)
+	tar -C $(rdma-core_build_dir) --xform='s|/statics/|/|' -hc $(RDMA_FILES) | tar -C $(rdma-core_install_dir) -xv > $(rdma-core_install_log)
+endef
+
+# Instantiate the rules for this package from the 'package' template.
+$(eval $(call package,rdma-core))
diff --git a/src/plugins/rdma/CMakeLists.txt b/src/plugins/rdma/CMakeLists.txt
new file mode 100644
index 0000000..35d43db
--- /dev/null
+++ b/src/plugins/rdma/CMakeLists.txt
@@ -0,0 +1,61 @@
+# Copyright (c) 2018 Cisco and/or its affiliates.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Locate the ibverbs headers and the static libibverbs/libmlx5 archives.
+# The plugin is skipped (with a warning) when any of them is missing or
+# when a trial link against them fails.
+message(STATUS "RDMA plugins - looking for ibverbs")
+
+find_path(IBVERBS_INCLUDE_DIR NAMES infiniband/verbs.h)
+find_library(IBVERBS_LIB NAMES libibverbs.a)
+find_library(MLX5_LIB NAMES libmlx5.a)
+
+# The headers are as mandatory as the archives: without this check a
+# NOTFOUND include directory would reach include_directories() below.
+if (NOT IBVERBS_INCLUDE_DIR OR NOT IBVERBS_LIB OR NOT MLX5_LIB)
+  message(WARNING "RDMA plugins - ibverbs not found - rdma_plugin disabled")
+  return()
+endif()
+
+# MLX5_LIB is known to be set here; pull the whole archive into the link.
+string_append(RDMA_LINK_FLAGS "-Wl,--whole-archive,${MLX5_LIB},--no-whole-archive")
+
+# Trial link of an empty source against the archives found above.
+set(CMAKE_REQUIRED_FLAGS "-fPIC -shared ${IBVERBS_LIB} ${RDMA_LINK_FLAGS}")
+CHECK_C_SOURCE_COMPILES("" IBVERBS_COMPILES_CHECK)
+
+if (NOT IBVERBS_COMPILES_CHECK)
+  message(WARNING "RDMA plugins - no working ibverbs found - rdma_plugin disabled")
+  return()
+endif()
+
+message(STATUS "RDMA plugins - found ${IBVERBS_INCLUDE_DIR}")
+message(STATUS "RDMA plugins - found ${IBVERBS_LIB}")
+message(STATUS "RDMA plugins - found ${MLX5_LIB}")
+
+include_directories(${IBVERBS_INCLUDE_DIR})
+
+# Declare the 'rdma' plugin. input.c and output.c appear both in SOURCES and
+# in MULTIARCH_SOURCES; the plugin is linked with the whole-archive flags
+# computed above and against the static libibverbs.
+add_vpp_plugin(rdma
+  SOURCES
+  cli.c
+  device.c
+  format.c
+  plugin.c
+  input.c
+  output.c
+
+  MULTIARCH_SOURCES
+  input.c
+  output.c
+
+  LINK_FLAGS
+  "${RDMA_LINK_FLAGS}"
+
+  LINK_LIBRARIES
+  ${IBVERBS_LIB}
+)
diff --git a/src/plugins/rdma/cli.c b/src/plugins/rdma/cli.c
new file mode 100644
index 0000000..8919603
--- /dev/null
+++ b/src/plugins/rdma/cli.c
@@ -0,0 +1,133 @@
+/*
+ *------------------------------------------------------------------
+ * Copyright (c) 2018 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+#include <stdint.h>
+#include <net/if.h>
+#include <sys/ioctl.h>
+#include <inttypes.h>
+
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+#include <vlib/pci/pci.h>
+#include <vnet/ethernet/ethernet.h>
+
+#include <rdma/rdma.h>
+
+/*
+ * CLI handler for "create interface rdma".
+ *
+ * Parses "name <ifname>" from the command line and calls rdma_create_if().
+ * Returns 0 on success (or when no input line is available), the parse
+ * error for unknown input, or the error reported by rdma_create_if().
+ * The line input and the interface name vector are released on every path.
+ */
+static clib_error_t *
+rdma_create_command_fn (vlib_main_t * vm, unformat_input_t * input,
+			vlib_cli_command_t * cmd)
+{
+  unformat_input_t _line_input, *line_input = &_line_input;
+  rdma_create_if_args_t args;
+  clib_error_t *error = 0;
+
+  clib_memset (&args, 0, sizeof (rdma_create_if_args_t));
+
+  /* Get a line of input. */
+  if (!unformat_user (input, unformat_line_input, line_input))
+    return 0;
+
+  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (line_input, "name %s", &args.ifname))
+	;
+      else
+	{
+	  /* build the message before line_input is freed below */
+	  error = clib_error_return (0, "unknown input `%U'",
+				     format_unformat_error, line_input);
+	  break;
+	}
+    }
+  unformat_free (line_input);
+
+  if (error == 0)
+    {
+      rdma_create_if (vm, &args);
+      error = args.error;
+    }
+
+  vec_free (args.ifname);
+
+  return error;
+}
+
+/* *INDENT-OFF* */
+/* registers the "create interface rdma" CLI command */
+VLIB_CLI_COMMAND (rdma_create_command, static) = {
+  .function = rdma_create_command_fn,
+  .path = "create interface rdma",
+  .short_help = "create interface rdma <name ifname>",
+};
+/* *INDENT-ON* */
+
+/*
+ * CLI handler for "delete interface rdma".
+ *
+ * The interface is given either as "sw_if_index <n>" or by interface name.
+ * Returns an error when the input cannot be parsed, when no interface was
+ * specified, or when the interface does not belong to the RDMA device
+ * class; otherwise deletes the device and returns 0.
+ */
+static clib_error_t *
+rdma_delete_command_fn (vlib_main_t * vm, unformat_input_t * input,
+			vlib_cli_command_t * cmd)
+{
+  unformat_input_t _line_input, *line_input = &_line_input;
+  u32 sw_if_index = ~0;
+  vnet_hw_interface_t *hw;
+  rdma_main_t *rm = &rdma_main;
+  rdma_device_t *rd;
+  vnet_main_t *vnm = vnet_get_main ();
+  clib_error_t *error = 0;
+
+  /* Get a line of input. */
+  if (!unformat_user (input, unformat_line_input, line_input))
+    return 0;
+
+  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (line_input, "sw_if_index %d", &sw_if_index))
+	;
+      else if (unformat (line_input, "%U", unformat_vnet_sw_interface,
+			 vnm, &sw_if_index))
+	;
+      else
+	{
+	  /* build the message before line_input is freed below */
+	  error = clib_error_return (0, "unknown input `%U'",
+				     format_unformat_error, line_input);
+	  break;
+	}
+    }
+  unformat_free (line_input);
+
+  if (error)
+    return error;
+
+  if (sw_if_index == ~0)
+    return clib_error_return (0,
+			      "please specify interface name or sw_if_index");
+
+  hw = vnet_get_sup_hw_interface (vnm, sw_if_index);
+  if (hw == NULL || rdma_device_class.index != hw->dev_class_index)
+    return clib_error_return (0, "not an RDMA interface");
+
+  rd = pool_elt_at_index (rm->devices, hw->dev_instance);
+
+  rdma_delete_if (vm, rd);
+
+  return 0;
+}
+
+/* *INDENT-OFF* */
+/* registers the "delete interface rdma" CLI command */
+VLIB_CLI_COMMAND (rdma_delete_command, static) = {
+  .function = rdma_delete_command_fn,
+  .path = "delete interface rdma",
+  .short_help = "delete interface rdma "
+    "{<interface> | sw_if_index <sw_idx>}",
+};
+/* *INDENT-ON* */
+
+/* CLI init hook: nothing to set up, always reports success (0). */
+clib_error_t *
+rdma_cli_init (vlib_main_t * vm)
+{
+  return 0;
+}
+
+VLIB_INIT_FUNCTION (rdma_cli_init);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/rdma/device.c b/src/plugins/rdma/device.c
new file mode 100644
index 0000000..31112a9
--- /dev/null
+++ b/src/plugins/rdma/device.c
@@ -0,0 +1,607 @@
+/*
+ *------------------------------------------------------------------
+ * Copyright (c) 2018 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <unistd.h>
+#include <fcntl.h>
+#include <net/if.h>
+#include <linux/if_link.h>
+#include <linux/if_ether.h>
+
+#include <vppinfra/linux/sysfs.h>
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+#include <vlib/pci/pci.h>
+#include <vnet/ethernet/ethernet.h>
+
+#include <rdma/rdma.h>
+
+rdma_main_t rdma_main;
+
+#define rdma_log_debug(dev, f, ...) \
+{ \
+ vlib_log(VLIB_LOG_LEVEL_DEBUG, rdma_main.log_class, "%U: " f, \
+ format_vlib_pci_addr, &rd->pci_addr, ##__VA_ARGS__); \
+};
+
+/*
+ * Flag-change callback handed to ethernet_register_interface().
+ * Not implemented yet: logs a warning and returns 0.
+ */
+static u32
+rdma_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hw, u32 flags)
+{
+  vlib_log_warn (rdma_main.log_class, "TODO");
+  return 0;
+}
+
+/*
+ * Refresh the link state and link speed of the interface from the
+ * attributes of the given port.
+ *
+ * When the port cannot be queried both speed and hw flags are reset to 0.
+ * Unknown width or speed encodings contribute 0 to the reported speed.
+ */
+static void
+rdma_update_state (vnet_main_t * vnm, rdma_device_t * rd, int port)
+{
+  struct ibv_port_attr attr;
+  u32 lanes;
+  u32 lane_speed;
+
+  if (ibv_query_port (rd->ctx, port, &attr) != 0)
+    {
+      vnet_hw_interface_set_link_speed (vnm, rd->hw_if_index, 0);
+      vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0);
+      return;
+    }
+
+  /* link state */
+  if (attr.state == IBV_PORT_ACTIVE || attr.state == IBV_PORT_ACTIVE_DEFER)
+    {
+      rd->flags |= RDMA_DEVICE_F_LINK_UP;
+      vnet_hw_interface_set_flags (vnm, rd->hw_if_index,
+				   VNET_HW_INTERFACE_FLAG_LINK_UP);
+    }
+  else
+    {
+      rd->flags &= ~RDMA_DEVICE_F_LINK_UP;
+      vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0);
+    }
+
+  /* link speed = decoded width * decoded per-lane speed */
+  if (attr.active_width == 1)
+    lanes = 1;
+  else if (attr.active_width == 2)
+    lanes = 4;
+  else if (attr.active_width == 4)
+    lanes = 8;
+  else if (attr.active_width == 8)
+    lanes = 12;
+  else
+    lanes = 0;
+
+  if (attr.active_speed == 1)
+    lane_speed = 2500000;
+  else if (attr.active_speed == 2)
+    lane_speed = 5000000;
+  else if (attr.active_speed == 4 || attr.active_speed == 8)
+    lane_speed = 10000000;
+  else if (attr.active_speed == 16)
+    lane_speed = 14000000;
+  else if (attr.active_speed == 32)
+    lane_speed = 25000000;
+  else
+    lane_speed = 0;
+
+  vnet_hw_interface_set_link_speed (vnm, rd->hw_if_index,
+				    lanes * lane_speed);
+}
+
+/*
+ * Error callback of the async event file: reports an error naming the
+ * PCI address of the device stored in the file's private data.
+ */
+static clib_error_t *
+rdma_async_event_error_ready (clib_file_t * f)
+{
+  rdma_device_t *rd = vec_elt_at_index (rdma_main.devices, f->private_data);
+
+  return clib_error_return (0, "RDMA async event error for device %U",
+			    format_vlib_pci_addr, &rd->pci_addr);
+}
+
+/*
+ * Read callback of the async event file.
+ *
+ * Fetches one async event from the device context and reacts to it:
+ * port active/error events refresh the link state, a fatal device event
+ * forces the link down and is logged, anything else is logged as a
+ * warning. The event is acknowledged before returning.
+ * Returns a unix error when the event cannot be fetched, 0 otherwise.
+ */
+static clib_error_t *
+rdma_async_event_read_ready (clib_file_t * f)
+{
+  vnet_main_t *vnm = vnet_get_main ();
+  rdma_main_t *rm = &rdma_main;
+  rdma_device_t *rd = vec_elt_at_index (rm->devices, f->private_data);
+  int ret;
+  struct ibv_async_event event;
+  ret = ibv_get_async_event (rd->ctx, &event);
+  if (ret < 0)
+    {
+      return clib_error_return_unix (0, "ibv_get_async_event() failed");
+    }
+
+  switch (event.event_type)
+    {
+    case IBV_EVENT_PORT_ACTIVE:	/* fallthrough */
+    case IBV_EVENT_PORT_ERR:
+      rdma_update_state (vnm, rd, event.element.port_num);
+      break;
+    case IBV_EVENT_DEVICE_FATAL:
+      rd->flags &= ~RDMA_DEVICE_F_LINK_UP;
+      vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0);
+      vlib_log_emerg (rm->log_class, "Fatal RDMA error for device %U",
+		      format_vlib_pci_addr, &rd->pci_addr);
+      break;
+    default:
+      vlib_log_warn (rm->log_class,
+		     "Unhandled RDMA async event %i for device %U",
+		     event.event_type, format_vlib_pci_addr, &rd->pci_addr);
+      break;
+    }
+
+  ibv_ack_async_event (&event);
+  return 0;
+}
+
+/*
+ * Hook the async event fd of the device context into the file poller.
+ *
+ * The fd is first switched to non-blocking mode, then registered with
+ * read and error callbacks; the device instance is passed as private data
+ * and the resulting file index is kept in the device for later removal.
+ * Returns a unix error if fcntl() fails, 0 otherwise.
+ */
+static clib_error_t *
+rdma_async_event_init (rdma_device_t * rd)
+{
+  clib_file_t t = { 0 };
+  int ret;
+
+  /* make RDMA async event fd non-blocking */
+  ret = fcntl (rd->ctx->async_fd, F_GETFL);
+  if (ret < 0)
+    {
+      return clib_error_return_unix (0, "fcntl(F_GETFL) failed");
+    }
+  ret = fcntl (rd->ctx->async_fd, F_SETFL, ret | O_NONBLOCK);
+  if (ret < 0)
+    {
+      return clib_error_return_unix (0, "fcntl(F_SETFL, O_NONBLOCK) failed");
+    }
+
+  /* register RDMA async event fd */
+  t.read_function = rdma_async_event_read_ready;
+  t.file_descriptor = rd->ctx->async_fd;
+  t.error_function = rdma_async_event_error_ready;
+  t.private_data = rd->dev_instance;
+  t.description =
+    format (0, "RDMA %U async event", format_vlib_pci_addr, &rd->pci_addr);
+
+  rd->async_event_clib_file_index = clib_file_add (&file_main, &t);
+
+  return 0;
+}
+
+/* Remove the async event file registered by rdma_async_event_init(). */
+static void
+rdma_async_event_cleanup (rdma_device_t * rd)
+{
+  clib_file_del_by_index (&file_main, rd->async_event_clib_file_index);
+}
+
+/*
+ * Register the device as an ethernet interface of the RDMA device class,
+ * using the device MAC address; the hw interface index is stored in the
+ * device. Returns whatever ethernet_register_interface() returns.
+ */
+static clib_error_t *
+rdma_register_interface (vnet_main_t * vnm, rdma_device_t * rd)
+{
+  clib_error_t *err;
+
+  err = ethernet_register_interface (vnm, rdma_device_class.index,
+				     rd->dev_instance, rd->hwaddr,
+				     &rd->hw_if_index, rdma_flag_change);
+  return err;
+}
+
+/*
+ * Undo the interface registration: clear the hw flags, unassign the rx
+ * thread of queue 0 and delete the ethernet interface.
+ */
+static void
+rdma_unregister_interface (vnet_main_t * vnm, rdma_device_t * rd)
+{
+  vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0);
+  vnet_hw_interface_unassign_rx_thread (vnm, rd->hw_if_index, 0);
+  ethernet_delete_interface (vnm, rd->hw_if_index);
+}
+
+/*
+ * Release every verbs object attached to the device (flows, memory
+ * region, tx/rx queue pairs and completion queues, protection domain,
+ * device context), free the stored error and the queue vectors, and
+ * return the device to the pool. Objects that were never created (null
+ * pointers) are skipped, so this can run on a partially initialised device.
+ */
+static void
+rdma_dev_cleanup (rdma_device_t * rd)
+{
+  rdma_main_t *rm = &rdma_main;
+  rdma_rxq_t *rxq;
+  rdma_txq_t *txq;
+
+  /* call fn (arg) only when arg is set; a non-zero result is logged */
+#define _(fn, arg) if (arg) \
+  { \
+    int rv; \
+    if ((rv = fn (arg))) \
+       rdma_log_debug (rd, #fn "() failed (rv = %d)", rv); \
+  }
+
+  _(ibv_destroy_flow, rd->flow_mcast);
+  _(ibv_destroy_flow, rd->flow_ucast);
+  _(ibv_dereg_mr, rd->mr);
+  vec_foreach (txq, rd->txqs)
+  {
+    _(ibv_destroy_qp, txq->qp);
+    _(ibv_destroy_cq, txq->cq);
+  }
+  vec_foreach (rxq, rd->rxqs)
+  {
+    _(ibv_destroy_qp, rxq->qp);
+    _(ibv_destroy_cq, rxq->cq);
+  }
+  _(ibv_dealloc_pd, rd->pd);
+  _(ibv_close_device, rd->ctx);
+#undef _
+
+  clib_error_free (rd->error);
+
+  vec_free (rd->rxqs);
+  vec_free (rd->txqs);
+  /* rd must not be used by the caller after this point */
+  pool_put (rm->devices, rd);
+}
+
+/*
+ * Set up receive queue 'qid' of the device with 'n_desc' descriptors.
+ *
+ * Makes sure slot qid exists in rd->rxqs, creates a completion queue and
+ * a raw-packet queue pair using that CQ, then moves the queue pair to the
+ * INIT state on port 1 and on to RTR.
+ * Returns 0 on success or a unix error naming the step that failed;
+ * objects created before the failure are left in the queue slot
+ * (rdma_dev_cleanup() destroys whatever is set).
+ */
+static clib_error_t *
+rdma_rxq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc)
+{
+  rdma_rxq_t *rxq;
+  struct ibv_qp_init_attr qpia;
+  struct ibv_qp_attr qpa;
+  int qp_flags;
+
+  vec_validate_aligned (rd->rxqs, qid, CLIB_CACHE_LINE_BYTES);
+  rxq = vec_elt_at_index (rd->rxqs, qid);
+  rxq->size = n_desc;
+
+  if ((rxq->cq = ibv_create_cq (rd->ctx, n_desc, NULL, NULL, 0)) == 0)
+    return clib_error_return_unix (0, "Create CQ Failed");
+
+  /* receive-only queue pair: only the recv capabilities are requested */
+  memset (&qpia, 0, sizeof (qpia));
+  qpia.qp_type = IBV_QPT_RAW_PACKET;
+  qpia.send_cq = rxq->cq;
+  qpia.recv_cq = rxq->cq;
+  qpia.cap.max_recv_wr = n_desc;
+  qpia.cap.max_recv_sge = 1;
+
+  if ((rxq->qp = ibv_create_qp (rd->pd, &qpia)) == 0)
+    return clib_error_return_unix (0, "Queue Pair create failed");
+
+  /* state transition: -> INIT (port 1) */
+  memset (&qpa, 0, sizeof (qpa));
+  qp_flags = IBV_QP_STATE | IBV_QP_PORT;
+  qpa.qp_state = IBV_QPS_INIT;
+  qpa.port_num = 1;
+  if (ibv_modify_qp (rxq->qp, &qpa, qp_flags) != 0)
+    return clib_error_return_unix (0, "Modify QP (init) Failed");
+
+  /* state transition: INIT -> RTR */
+  memset (&qpa, 0, sizeof (qpa));
+  qp_flags = IBV_QP_STATE;
+  qpa.qp_state = IBV_QPS_RTR;
+  if (ibv_modify_qp (rxq->qp, &qpa, qp_flags) != 0)
+    return clib_error_return_unix (0, "Modify QP (receive) Failed");
+
+  return 0;
+}
+
+/*
+ * Set up transmit queue 'qid' of the device with 'n_desc' descriptors.
+ *
+ * Makes sure slot qid exists in rd->txqs, creates a completion queue and
+ * a raw-packet queue pair using that CQ, then walks the queue pair through
+ * the INIT (port 1), RTR and RTS states.
+ * Returns 0 on success or a unix error naming the step that failed;
+ * objects created before the failure are left in the queue slot
+ * (rdma_dev_cleanup() destroys whatever is set).
+ */
+static clib_error_t *
+rdma_txq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc)
+{
+  rdma_txq_t *txq;
+  struct ibv_qp_init_attr qpia;
+  struct ibv_qp_attr qpa;
+  int qp_flags;
+
+  vec_validate_aligned (rd->txqs, qid, CLIB_CACHE_LINE_BYTES);
+  txq = vec_elt_at_index (rd->txqs, qid);
+  txq->size = n_desc;
+
+  if ((txq->cq = ibv_create_cq (rd->ctx, n_desc, NULL, NULL, 0)) == 0)
+    return clib_error_return_unix (0, "Create CQ Failed");
+
+  /* send-only queue pair: only the send capabilities are requested */
+  memset (&qpia, 0, sizeof (qpia));
+  qpia.qp_type = IBV_QPT_RAW_PACKET;
+  qpia.send_cq = txq->cq;
+  qpia.recv_cq = txq->cq;
+  qpia.cap.max_send_wr = n_desc;
+  qpia.cap.max_send_sge = 1;
+
+  if ((txq->qp = ibv_create_qp (rd->pd, &qpia)) == 0)
+    return clib_error_return_unix (0, "Queue Pair create failed");
+
+  /* state transition: -> INIT (port 1) */
+  memset (&qpa, 0, sizeof (qpa));
+  qp_flags = IBV_QP_STATE | IBV_QP_PORT;
+  qpa.qp_state = IBV_QPS_INIT;
+  qpa.port_num = 1;
+  if (ibv_modify_qp (txq->qp, &qpa, qp_flags) != 0)
+    return clib_error_return_unix (0, "Modify QP (init) Failed");
+
+  /* state transition: INIT -> RTR */
+  memset (&qpa, 0, sizeof (qpa));
+  qp_flags = IBV_QP_STATE;
+  qpa.qp_state = IBV_QPS_RTR;
+  if (ibv_modify_qp (txq->qp, &qpa, qp_flags) != 0)
+    return clib_error_return_unix (0, "Modify QP (receive) Failed");
+
+  /* state transition: RTR -> RTS */
+  memset (&qpa, 0, sizeof (qpa));
+  qp_flags = IBV_QP_STATE;
+  qpa.qp_state = IBV_QPS_RTS;
+  if (ibv_modify_qp (txq->qp, &qpa, qp_flags) != 0)
+    return clib_error_return_unix (0, "Modify QP (send) Failed");
+  return 0;
+}
+
+/*
+ * Initialise the verbs resources of a device whose context was opened by
+ * the caller.
+ *
+ * Allocates the protection domain, one rx queue and one tx queue per vlib
+ * main (512 descriptors each), registers the buffer memory as a memory
+ * region, generates a MAC address and installs two flow rules on rx
+ * queue 0 (unicast to that MAC, and multicast).
+ * Returns 0 on success or the error of the first step that failed; nothing
+ * is released here on failure (the caller runs rdma_dev_cleanup()).
+ */
+static clib_error_t *
+rdma_dev_init (vlib_main_t * vm, rdma_device_t * rd)
+{
+  clib_error_t *err;
+  vlib_buffer_main_t *bm = vm->buffer_main;
+  vlib_thread_main_t *tm = vlib_get_thread_main ();
+  u16 i;
+
+  /* the caller found no matching device, or opening it failed */
+  if (rd->ctx == 0)
+    return clib_error_return_unix (0, "Device Open Failed");
+
+  if ((rd->pd = ibv_alloc_pd (rd->ctx)) == 0)
+    return clib_error_return_unix (0, "PD Alloc Failed");
+
+  if ((err = rdma_rxq_init (vm, rd, 0, 512)))
+    return err;
+
+  for (i = 0; i < tm->n_vlib_mains; i++)
+    if ((err = rdma_txq_init (vm, rd, i, 512)))
+      return err;
+
+  /* register the whole buffer memory range with local write access */
+  if ((rd->mr = ibv_reg_mr (rd->pd, (void *) bm->buffer_mem_start,
+			    bm->buffer_mem_size,
+			    IBV_ACCESS_LOCAL_WRITE)) == 0)
+    return clib_error_return_unix (0, "Register MR Failed");
+
+  ethernet_mac_address_generate (rd->hwaddr);
+
+  /*
+   * restrict packets steering to our MAC
+   * allows to share a single HW NIC with multiple RDMA ifaces
+   * and/or Linux
+   */
+  struct raw_eth_flow_attr
+  {
+    struct ibv_flow_attr attr;
+    struct ibv_flow_spec_eth spec_eth;
+  } __attribute__ ((packed)) fa;
+  memset (&fa, 0, sizeof (fa));
+  fa.attr.num_of_specs = 1;
+  fa.attr.port = 1;
+  fa.spec_eth.type = IBV_FLOW_SPEC_ETH;
+  fa.spec_eth.size = sizeof (struct ibv_flow_spec_eth);
+  memcpy (fa.spec_eth.val.dst_mac, rd->hwaddr,
+	  sizeof (fa.spec_eth.val.dst_mac));
+  /* all-ones mask: match the full destination MAC */
+  memset (fa.spec_eth.mask.dst_mac, 0xff, sizeof (fa.spec_eth.mask.dst_mac));
+  if ((rd->flow_ucast = ibv_create_flow (rd->rxqs[0].qp, &fa.attr)) == 0)
+    return clib_error_return_unix (0, "create Flow Failed");
+
+  /* receive multicast packets too */
+  memset (&fa, 0, sizeof (fa));
+  fa.attr.num_of_specs = 1;
+  fa.attr.port = 1;
+  fa.attr.flags = IBV_FLOW_ATTR_FLAGS_DONT_TRAP;	/* let others receive them too */
+  fa.spec_eth.type = IBV_FLOW_SPEC_ETH;
+  fa.spec_eth.size = sizeof (struct ibv_flow_spec_eth);
+  /* match only on the lowest bit of the first destination MAC byte */
+  fa.spec_eth.val.dst_mac[0] = 1;
+  fa.spec_eth.mask.dst_mac[0] = 1;
+  if ((rd->flow_mcast = ibv_create_flow (rd->rxqs[0].qp, &fa.attr)) == 0)
+    return clib_error_return_unix (0, "create Flow Failed");
+
+  return 0;
+}
+
+/*
+ * Resolve the sysfs link 'path' and parse its name as a PCI address.
+ *
+ * Returns non-zero and fills 'addr' on success; returns 0 when the link
+ * cannot be resolved or its name is not a PCI address.
+ */
+static uword
+sysfs_path_to_pci_addr (char *path, vlib_pci_addr_t * addr)
+{
+  uword rv;
+  unformat_input_t in;
+  u8 *s;
+
+  s = clib_sysfs_link_to_name (path);
+  /* no link name (callers already treat 0 as failure): nothing to parse */
+  if (s == 0)
+    return 0;
+
+  unformat_init_string (&in, (char *) s, strlen ((char *) s));
+  rv = unformat (&in, "%U", unformat_vlib_pci_addr, addr);
+  unformat_free (&in);
+  vec_free (s);
+  return rv;
+}
+
+/*
+ * Create an RDMA interface on top of the Linux netdev args->ifname.
+ *
+ * The netdev must be bound to the mlx5_core module. Its PCI address is
+ * used to find and open the matching ibverbs device, which is then
+ * initialised, registered as an ethernet interface and hooked to the
+ * RDMA input node.
+ *
+ * On success args->sw_if_index holds the new interface index. On failure
+ * args->error and args->rv are set, the error is logged and everything
+ * allocated here (including the device pool entry) is released.
+ * Temporary vectors and the ibverbs device list are freed on all paths.
+ */
+void
+rdma_create_if (vlib_main_t * vm, rdma_create_if_args_t * args)
+{
+  vnet_main_t *vnm = vnet_get_main ();
+  rdma_main_t *rm = &rdma_main;
+  rdma_device_t *rd = 0;
+  struct ibv_device **dev_list = 0;
+  int n_devs = 0;
+  u8 *s = 0, *s2 = 0;
+
+  pool_get_zero (rm->devices, rd);
+  rd->dev_instance = rd - rm->devices;
+  rd->per_interface_next_index = ~0;
+
+  /* check if device exist and if it is bound to mlx5_core */
+  s = format (s, "/sys/class/net/%s/device/driver/module%c", args->ifname, 0);
+  s2 = clib_sysfs_link_to_name ((char *) s);
+
+  if (s2 == 0 || strncmp ((char *) s2, "mlx5_core", 9) != 0)
+    {
+      args->error =
+	clib_error_return (0,
+			   "invalid interface (only mlx5 supported for now)");
+      goto err0;
+    }
+
+  /* extract PCI address */
+  vec_reset_length (s);
+  s = format (s, "/sys/class/net/%s/device%c", args->ifname, 0);
+  if (sysfs_path_to_pci_addr ((char *) s, &rd->pci_addr) == 0)
+    {
+      args->error = clib_error_return (0, "cannot find PCI address");
+      goto err0;
+    }
+
+  dev_list = ibv_get_device_list (&n_devs);
+  if (dev_list == 0 || n_devs == 0)
+    {
+      args->error =
+	clib_error_return_unix (0,
+				"no RDMA devices available, errno = %d. Is the ib_uverbs module loaded?",
+				errno);
+      goto err1;
+    }
+
+  /* open the first ibverbs device sitting at the same PCI address */
+  for (int i = 0; i < n_devs; i++)
+    {
+      vlib_pci_addr_t addr;
+
+      vec_reset_length (s);
+      s = format (s, "%s/device%c", dev_list[i]->dev_path, 0);
+
+      if (sysfs_path_to_pci_addr ((char *) s, &addr) == 0)
+	continue;
+
+      if (addr.as_u32 != rd->pci_addr.as_u32)
+	continue;
+
+      if ((rd->ctx = ibv_open_device (dev_list[i])))
+	break;
+    }
+
+  /* rdma_dev_init() reports the error if no device could be opened */
+  if ((args->error = rdma_dev_init (vm, rd)))
+    goto err2;
+
+  if ((args->error = rdma_register_interface (vnm, rd)))
+    goto err2;
+
+  if ((args->error = rdma_async_event_init (rd)))
+    goto err3;
+
+  rdma_update_state (vnm, rd, 1);
+
+  vnet_sw_interface_t *sw = vnet_get_hw_sw_interface (vnm, rd->hw_if_index);
+  args->sw_if_index = rd->sw_if_index = sw->sw_if_index;
+  /*
+   * FIXME: add support for interrupt mode
+   * vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, rd->hw_if_index);
+   * hw->flags |= VNET_HW_INTERFACE_FLAG_SUPPORTS_INT_MODE;
+   */
+  vnet_hw_interface_set_input_node (vnm, rd->hw_if_index,
+				    rdma_input_node.index);
+  vnet_hw_interface_assign_rx_thread (vnm, rd->hw_if_index, 0, ~0);
+
+  /* success: the opened context stays valid, only the temporaries go */
+  ibv_free_device_list (dev_list);
+  vec_free (s2);
+  vec_free (s);
+  return;
+
+err3:
+  rdma_unregister_interface (vnm, rd);
+err2:
+  /* rdma_dev_cleanup() also returns rd to the pool */
+  rdma_dev_cleanup (rd);
+  rd = 0;
+err1:
+  /* the list is NULL when ibv_get_device_list() itself failed */
+  if (dev_list)
+    ibv_free_device_list (dev_list);
+err0:
+  /* early failures: nothing but the pool entry was allocated */
+  if (rd)
+    pool_put (rm->devices, rd);
+  vec_free (s2);
+  vec_free (s);
+  args->rv = VNET_API_ERROR_INVALID_INTERFACE;
+  vlib_log_err (rm->log_class, "%U", format_clib_error, args->error);
+}
+
+/*
+ * Delete an RDMA interface: stop async event handling, unregister the
+ * interface, then release the device (which returns it to the pool).
+ */
+void
+rdma_delete_if (vlib_main_t * vm, rdma_device_t * rd)
+{
+  rdma_async_event_cleanup (rd);
+  rdma_unregister_interface (vnet_get_main (), rd);
+  rdma_dev_cleanup (rd);
+}
+
+/*
+ * Admin up/down handler of the RDMA device class.
+ *
+ * Refuses to act on a device flagged in error; otherwise mirrors the
+ * requested admin state into the hw link flag and the device flags.
+ */
+static clib_error_t *
+rdma_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags)
+{
+  vnet_hw_interface_t *hi = vnet_get_hw_interface (vnm, hw_if_index);
+  rdma_device_t *rd = vec_elt_at_index (rdma_main.devices, hi->dev_instance);
+
+  if (rd->flags & RDMA_DEVICE_F_ERROR)
+    return clib_error_return (0, "device is in error state");
+
+  if ((flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) == 0)
+    {
+      /* admin down */
+      vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0);
+      rd->flags &= ~RDMA_DEVICE_F_ADMIN_UP;
+      return 0;
+    }
+
+  /* admin up */
+  vnet_hw_interface_set_flags (vnm, rd->hw_if_index,
+			       VNET_HW_INTERFACE_FLAG_LINK_UP);
+  rd->flags |= RDMA_DEVICE_F_ADMIN_UP;
+  return 0;
+}
+
+/*
+ * rx-redirect handler of the RDMA device class.
+ *
+ * A node_index of ~0 turns redirection off; any other value is added as
+ * a next node of the RDMA input node and the resulting next index is
+ * stored in the device.
+ */
+static void
+rdma_set_interface_next_node (vnet_main_t * vnm, u32 hw_if_index,
+			      u32 node_index)
+{
+  vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
+  rdma_device_t *rd = pool_elt_at_index (rdma_main.devices, hw->dev_instance);
+  u32 next_index = node_index;
+
+  if (node_index != ~0)
+    next_index = vlib_node_add_next (vlib_get_main (), rdma_input_node.index,
+				     node_index);
+
+  rd->per_interface_next_index = next_index;
+}
+
+/* tx error strings: the second field of each foreach_rdma_tx_func_error entry */
+static char *rdma_tx_func_error_strings[] = {
+#define _(n,s) s,
+  foreach_rdma_tx_func_error
+#undef _
+};
+
+/* *INDENT-OFF* */
+/* RDMA device class: format, admin up/down, rx redirect and tx error hooks */
+VNET_DEVICE_CLASS (rdma_device_class,) =
+{
+  .name = "RDMA interface",
+  .format_device = format_rdma_device,
+  .format_device_name = format_rdma_device_name,
+  .admin_up_down_function = rdma_interface_admin_up_down,
+  .rx_redirect_to_node = rdma_set_interface_next_node,
+  .tx_function_n_errors = RDMA_TX_N_ERROR,
+  .tx_function_error_strings = rdma_tx_func_error_strings,
+};
+/* *INDENT-ON* */
+
+/* Plugin init hook: registers the "rdma" log class; always returns 0. */
+clib_error_t *
+rdma_init (vlib_main_t * vm)
+{
+  rdma_main.log_class = vlib_log_register_class ("rdma", 0);
+  return 0;
+}
+
+VLIB_INIT_FUNCTION (rdma_init);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/rdma/format.c b/src/plugins/rdma/format.c
new file mode 100644
index 0000000..7ef65d4
--- /dev/null
+++ b/src/plugins/rdma/format.c
@@ -0,0 +1,89 @@
+/*
+ *------------------------------------------------------------------
+ * Copyright (c) 2018 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+#include <vlib/pci/pci.h>
+#include <vnet/ethernet/ethernet.h>
+
+#include <rdma/rdma.h>
+
+/*
+ * Format function for the interface name of an RDMA device.
+ * va_args: u32 index into rdma_main.devices.
+ * Appends "rdma-<dev_instance>" to s and returns the result.
+ */
+u8 *
+format_rdma_device_name (u8 * s, va_list * args)
+{
+  u32 dev_index = va_arg (*args, u32);
+  rdma_device_t *rd = vec_elt_at_index (rdma_main.devices, dev_index);
+
+  return format (s, "rdma-%u", rd->dev_instance);
+}
+
+/*
+ * Format function listing the flags set on a device.
+ * va_args: rdma_device_t pointer.
+ * Appends the display string of every flag from foreach_rdma_device_flags
+ * that is set in rd->flags, separated by single spaces.
+ */
+u8 *
+format_rdma_device_flags (u8 * s, va_list * args)
+{
+  rdma_device_t *rd = va_arg (*args, rdma_device_t *);
+  u8 *names = 0;
+
+  /* one test per known flag; a separator is added from the second name on */
+#define _(bit, sym, str)                                   \
+  if (rd->flags & (1 << bit))                              \
+    names = format (names, "%s%s", names ? " ":"", str);
+  foreach_rdma_device_flags
+#undef _
+  s = format (s, "%v", names);
+  vec_free (names);
+  return s;
+}
+
+/*
+ * Format function describing an RDMA device.
+ * va_args: u32 index into rdma_main.devices.
+ * Appends "flags: <flag names>" and, when the device carries an error, a
+ * second line with that error indented to the current indent of s.
+ */
+u8 *
+format_rdma_device (u8 * s, va_list * args)
+{
+  u32 dev_index = va_arg (*args, u32);
+  rdma_device_t *rd = vec_elt_at_index (rdma_main.devices, dev_index);
+  /* captured before anything is appended */
+  u32 indent = format_get_indent (s);
+
+  s = format (s, "flags: %U", format_rdma_device_flags, rd);
+  if (rd->error == 0)
+    return s;
+
+  return format (s, "\n%Uerror %U", format_white_space, indent,
+		 format_clib_error, rd->error);
+}
+
+/*
+ * Format function for a trace record of the rdma input node.
+ * va_args: vlib_main_t pointer, vlib_node_t pointer, rdma_input_trace_t
+ * pointer. Appends the interface name, its hw_if_index and the name of
+ * the next node the packet was sent to.
+ */
+u8 *
+format_rdma_input_trace (u8 * s, va_list * args)
+{
+  vlib_main_t *vm = va_arg (*args, vlib_main_t *);
+  vlib_node_t *node = va_arg (*args, vlib_node_t *);
+  rdma_input_trace_t *tr = va_arg (*args, rdma_input_trace_t *);
+  vnet_hw_interface_t *hw =
+    vnet_get_hw_interface (vnet_get_main (), tr->hw_if_index);
+
+  return format (s, "rdma: %v (%d) next-node %U",
+		 hw->name, tr->hw_if_index, format_vlib_next_node_name, vm,
+		 node->index, tr->next_index);
+}
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/rdma/input.c b/src/plugins/rdma/input.c
new file mode 100644
index 0000000..001d1c5
--- /dev/null
+++ b/src/plugins/rdma/input.c
@@ -0,0 +1,202 @@
+/*
+ *------------------------------------------------------------------
+ * Copyright (c) 2018 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+#include <vlib/pci/pci.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/devices/devices.h>
+
+#include <rdma/rdma.h>
+
+/* Error counters of the rdma input node: _(symbol, description) */
+#define foreach_rdma_input_error \
+  _(BUFFER_ALLOC, "buffer alloc error")
+
+/* One RDMA_INPUT_ERROR_<symbol> per entry, followed by the entry count */
+typedef enum
+{
+#define _(sym, str) RDMA_INPUT_ERROR_##sym,
+  foreach_rdma_input_error
+#undef _
+    RDMA_INPUT_N_ERROR,
+} rdma_input_error_t;
+
+/* Descriptions in the same order as rdma_input_error_t */
+static __clib_unused char *rdma_input_error_strings[] = {
+#define _(sym, str) str,
+  foreach_rdma_input_error
+#undef _
+};
+
+/*
+ * Post fresh receive buffers on rxq until rxq->size buffers are enqueued,
+ * at most VLIB_FRAME_SIZE per call. Each buffer is posted as a single-SGE
+ * receive work request whose wr_id is the vlib buffer index. A buffer
+ * whose post fails is freed again and not counted in rxq->n_enq.
+ */
+static_always_inline void
+rdma_device_input_refill (vlib_main_t * vm, rdma_device_t * rd,
+			  rdma_rxq_t * rxq)
+{
+  u32 bis[VLIB_FRAME_SIZE];
+  struct ibv_sge sge;
+  struct ibv_recv_wr wr, *bad_wr;
+  u32 n_bufs, i;
+
+  /* nothing to do when the queue is already full */
+  if (rxq->size <= rxq->n_enq)
+    return;
+
+  n_bufs = clib_min (VLIB_FRAME_SIZE, rxq->size - rxq->n_enq);
+  n_bufs = vlib_buffer_alloc (vm, bis, n_bufs);
+
+  /* fields shared by every work request posted below */
+  sge.length = vlib_buffer_get_default_data_size (vm);
+  sge.lkey = rd->mr->lkey;
+  wr.next = NULL;
+  wr.sg_list = &sge;
+  wr.num_sge = 1;
+
+  for (i = 0; i < n_bufs; i++)
+    {
+      sge.addr = vlib_buffer_get_va (vlib_get_buffer (vm, bis[i]));
+      wr.wr_id = bis[i];
+      if (ibv_post_recv (rxq->qp, &wr, &bad_wr) == 0)
+	rxq->n_enq++;
+      else
+	vlib_buffer_free (vm, &bis[i], 1);
+    }
+}
+
+/*
+ * Receive path for one RX queue of one device.
+ *
+ * Polls up to VLIB_FRAME_SIZE completions from the queue's CQ, turns each
+ * one into a vlib buffer (wr_id is the buffer index, byte_len the packet
+ * length), hands them to the next node in a single new frame, updates the
+ * interface RX counter and refills the receive queue.
+ *
+ * vm, node - calling thread and input node runtime
+ * frame    - unused
+ * rd       - device being polled
+ * qid      - index into rd->rxqs
+ *
+ * Returns the number of packets received (0 when nothing was polled or
+ * when polling failed).
+ */
+static_always_inline uword
+rdma_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
+			  vlib_frame_t * frame, rdma_device_t * rd, u16 qid)
+{
+  vnet_main_t *vnm = vnet_get_main ();
+  rdma_rxq_t *rxq = vec_elt_at_index (rd->rxqs, qid);
+  struct ibv_wc wc[VLIB_FRAME_SIZE];
+  u32 next_index = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT;
+  u32 *to_next, n_left_to_next;
+  u32 n_trace, n_rx_packets, n_rx_bytes = 0;
+  int n_polled, i;
+
+  /*
+   * ibv_poll_cq() returns an int that is negative on failure. Keep it
+   * signed: stored in a u32, -1 would pass the "<= 0" test and make the
+   * loop below run far past the end of wc[] and to_next[].
+   */
+  n_polled = ibv_poll_cq (rxq->cq, VLIB_FRAME_SIZE, wc);
+  if (n_polled <= 0)
+    {
+      /* nothing received: top up the queue, do not touch the next frame */
+      rdma_device_input_refill (vm, rd, rxq);
+      return 0;
+    }
+  n_rx_packets = n_polled;
+
+  if (PREDICT_FALSE (rd->per_interface_next_index != ~0))
+    next_index = rd->per_interface_next_index;
+
+  vlib_get_new_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+  /* NOTE(review): wc[i].status is not checked before byte_len is used */
+  for (i = 0; i < n_polled; i++)
+    {
+      u32 bi = wc[i].wr_id;
+      vlib_buffer_t *b = vlib_get_buffer (vm, bi);
+      b->current_length = wc[i].byte_len;
+      vnet_buffer (b)->sw_if_index[VLIB_RX] = rd->sw_if_index;
+      vnet_buffer (b)->sw_if_index[VLIB_TX] = ~0;
+      to_next[i] = bi;
+      n_rx_bytes += wc[i].byte_len;
+    }
+
+  if (PREDICT_FALSE ((n_trace = vlib_get_trace_count (vm, node))))
+    {
+      u32 n_left = n_rx_packets;
+      u32 *bi = to_next;
+
+      /* trace packets in arrival order until the trace budget runs out */
+      while (n_trace && n_left)
+	{
+	  vlib_buffer_t *b;
+	  rdma_input_trace_t *tr;
+	  b = vlib_get_buffer (vm, bi[0]);
+	  vlib_trace_buffer (vm, node, next_index, b, /* follow_chain */ 0);
+	  tr = vlib_add_trace (vm, node, b, sizeof (*tr));
+	  tr->next_index = next_index;
+	  tr->hw_if_index = rd->hw_if_index;
+
+	  /* next */
+	  n_trace--;
+	  n_left--;
+	  bi++;
+	}
+      vlib_set_trace_count (vm, node, n_trace);
+    }
+
+  if (PREDICT_TRUE (next_index == VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT))
+    {
+      vlib_next_frame_t *nf;
+      vlib_frame_t *f;
+      ethernet_input_frame_t *ef;
+
+      /* mark the frame as carrying packets of one single interface */
+      nf = vlib_node_runtime_get_next_frame (vm, node, next_index);
+      f = vlib_get_frame (vm, nf->frame_index);
+      f->flags = ETH_INPUT_FRAME_F_SINGLE_SW_IF_IDX;
+
+      ef = vlib_frame_scalar_args (f);
+      ef->sw_if_index = rd->sw_if_index;
+      ef->hw_if_index = rd->hw_if_index;
+      /* TODO: ETH_INPUT_FRAME_F_IP4_CKSUM_OK is not set on the frame */
+    }
+
+  n_left_to_next -= n_rx_packets;
+  vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+
+  /* combined_sw_if_counters is indexed by sw_if_index, not hw_if_index */
+  vlib_increment_combined_counter
+    (vnm->interface_main.combined_sw_if_counters +
+     VNET_INTERFACE_COUNTER_RX, vm->thread_index,
+     rd->sw_if_index, n_rx_packets, n_rx_bytes);
+
+  /* completed buffers left the queue: account for them and refill */
+  rxq->n_enq -= n_rx_packets;
+  rdma_device_input_refill (vm, rd, rxq);
+
+  return n_rx_packets;
+}
+
+/*
+ * Input node function: polls every device/queue pair assigned to this
+ * node runtime, skipping devices that are not admin-up, and returns the
+ * total number of packets received.
+ */
+VLIB_NODE_FN (rdma_input_node) (vlib_main_t * vm,
+				vlib_node_runtime_t * node,
+				vlib_frame_t * frame)
+{
+  vnet_device_input_runtime_t *rt = (void *) node->runtime_data;
+  vnet_device_and_queue_t *dq;
+  u32 n_rx_total = 0;
+
+  foreach_device_and_queue (dq, rt->devices_and_queues)
+  {
+    rdma_device_t *rd =
+      vec_elt_at_index (rdma_main.devices, dq->dev_instance);
+
+    /* only poll devices that are administratively up */
+    if (rd->flags & RDMA_DEVICE_F_ADMIN_UP)
+      n_rx_total +=
+	rdma_device_input_inline (vm, node, frame, rd, dq->queue_id);
+  }
+  return n_rx_total;
+}
+
+/* Registration of the input node; it is created in the disabled state */
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (rdma_input_node) = {
+  .name = "rdma-input",
+  .type = VLIB_NODE_TYPE_INPUT,
+  .state = VLIB_NODE_STATE_DISABLED,
+  .sibling_of = "device-input",
+  .format_trace = format_rdma_input_trace,
+  .error_strings = rdma_input_error_strings,
+  .n_errors = RDMA_INPUT_N_ERROR,
+};
+
+/* *INDENT-ON* */
+
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/rdma/output.c b/src/plugins/rdma/output.c
new file mode 100644
index 0000000..4107843
--- /dev/null
+++ b/src/plugins/rdma/output.c
@@ -0,0 +1,133 @@
+/*
+ *------------------------------------------------------------------
+ * Copyright (c) 2018 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+#include <vlib/pci/pci.h>
+#include <vppinfra/ring.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/devices/devices.h>
+
+#include <rdma/rdma.h>
+
+/*
+ * Post up to n_left buffers on txq, one signaled single-SGE send per
+ * buffer with wr_id set to the buffer index. Stops at the first post that
+ * fails.
+ *
+ * Adds the number of posted packets to *n_tx_packets and their lengths to
+ * *n_tx_bytes. Returns the number of buffers posted.
+ */
+static_always_inline u16
+rdma_device_output_tx (vlib_main_t * vm, rdma_device_t * rd, rdma_txq_t * txq,
+		       u32 * buffers, u16 n_left, u32 * n_tx_packets,
+		       u32 * n_tx_bytes)
+{
+  struct ibv_sge sge;
+  struct ibv_send_wr wr, *bad_wr;
+  u16 n_sent = 0;
+
+  while (n_sent < n_left)
+    {
+      vlib_buffer_t *b = vlib_get_buffer (vm, buffers[n_sent]);
+
+      /* the SGE covers the current data of the buffer */
+      sge.addr = vlib_buffer_get_current_va (b);
+      sge.length = b->current_length;
+      sge.lkey = rd->mr->lkey;
+
+      memset (&wr, 0, sizeof (wr));
+      wr.wr_id = buffers[n_sent];
+      wr.sg_list = &sge;
+      wr.num_sge = 1;
+      wr.opcode = IBV_WR_SEND;
+      wr.send_flags = IBV_SEND_SIGNALED;
+
+      if (ibv_post_send (txq->qp, &wr, &bad_wr) != 0)
+	break;
+
+      *n_tx_bytes += b->current_length;
+      n_sent++;
+    }
+
+  *n_tx_packets += n_sent;
+  return n_sent;
+}
+
+/*
+ * Poll up to VLIB_FRAME_SIZE completions from the TX completion queue and
+ * free the buffers they refer to. Does nothing when the poll returns no
+ * completion or fails.
+ */
+static_always_inline void
+rdma_device_output_free (vlib_main_t * vm, rdma_txq_t * txq)
+{
+  struct ibv_wc completions[VLIB_FRAME_SIZE];
+  u32 bis[VLIB_FRAME_SIZE];
+  int n_done, k;
+
+  n_done = ibv_poll_cq (txq->cq, VLIB_FRAME_SIZE, completions);
+
+  /* the wr_id of each completion is the buffer index that was posted */
+  for (k = 0; k < n_done; k++)
+    bis[k] = completions[k].wr_id;
+
+  if (n_done > 0)
+    vlib_buffer_free (vm, bis, n_done);
+}
+
+/*
+ * TX function of the RDMA device class.
+ *
+ * Picks a TX queue from the thread index, takes the queue lock if one was
+ * initialized, then alternates between reclaiming completed buffers and
+ * posting the frame's buffers. When a pass cannot post everything, it is
+ * retried up to n_retry more times; whatever is still unsent after that
+ * is freed and counted as RDMA_TX_ERROR_NO_FREE_SLOTS.
+ *
+ * Returns the number of buffers that were posted.
+ */
+VNET_DEVICE_CLASS_TX_FN (rdma_device_class) (vlib_main_t * vm,
+					     vlib_node_runtime_t * node,
+					     vlib_frame_t * frame)
+{
+  vnet_main_t *vnm = vnet_get_main ();
+  rdma_main_t *rm = &rdma_main;
+  vnet_interface_output_runtime_t *ord = (void *) node->runtime_data;
+  rdma_device_t *rd = pool_elt_at_index (rm->devices, ord->dev_instance);
+  u32 thread_index = vm->thread_index;
+  u8 qid = thread_index;
+  rdma_txq_t *txq = vec_elt_at_index (rd->txqs, qid % vec_len (rd->txqs));
+  u32 *buffers = vlib_frame_vector_args (frame);
+  u16 n_left = frame->n_vectors;
+  u16 n_retry = 5;
+  u32 n_tx_packets = 0, n_tx_bytes = 0;
+
+  clib_spinlock_lock_if_init (&txq->lock);
+
+  while (n_left)
+    {
+      u16 n;
+
+      /* reclaim completed sends first so that slots may become free */
+      rdma_device_output_free (vm, txq);
+      n =
+	rdma_device_output_tx (vm, rd, txq, buffers, n_left, &n_tx_packets,
+			       &n_tx_bytes);
+      n_left -= n;
+      buffers += n;
+
+      if (n_left == 0)
+	break;
+
+      /*
+       * Drop only once the retry budget is used up. The previous test,
+       * "n_left && n_retry--", was true on the very first partial send
+       * and so dropped the remainder without ever retrying.
+       */
+      if (n_retry == 0)
+	{
+	  vlib_buffer_free (vm, buffers, n_left);
+	  vlib_error_count (vm, node->node_index,
+			    RDMA_TX_ERROR_NO_FREE_SLOTS, n_left);
+	  break;
+	}
+      n_retry--;
+    }
+
+  clib_spinlock_unlock_if_init (&txq->lock);
+
+  /* combined_sw_if_counters is indexed by sw_if_index, not hw_if_index */
+  vlib_increment_combined_counter
+    (vnm->interface_main.combined_sw_if_counters +
+     VNET_INTERFACE_COUNTER_TX, thread_index,
+     rd->sw_if_index, n_tx_packets, n_tx_bytes);
+
+  return frame->n_vectors - n_left;
+}
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/rdma/plugin.c b/src/plugins/rdma/plugin.c
new file mode 100644
index 0000000..f229b75
--- /dev/null
+++ b/src/plugins/rdma/plugin.c
@@ -0,0 +1,35 @@
+/*
+ *------------------------------------------------------------------
+ * Copyright (c) 2018 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/plugin/plugin.h>
+#include <vpp/app/version.h>
+
+/* Plugin registration record: description and build version */
+/* *INDENT-OFF* */
+VLIB_PLUGIN_REGISTER () = {
+  .description = "RDMA (ibverb) Device Plugin",
+  .version = VPP_BUILD_VER,
+};
+/* *INDENT-ON* */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/rdma/rdma.h b/src/plugins/rdma/rdma.h
new file mode 100644
index 0000000..860ddab
--- /dev/null
+++ b/src/plugins/rdma/rdma.h
@@ -0,0 +1,141 @@
+/*
+ *------------------------------------------------------------------
+ * Copyright (c) 2018 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#ifndef _RDMA_H_
+#define _RDMA_H_
+
+#include <infiniband/verbs.h>
+#include <vlib/log.h>
+
+/*
+ * Device flags: _(bit position, enum suffix, display string).
+ * The last entry carries no line continuation so that the definition ends
+ * on its own line and cannot absorb whatever follows it.
+ */
+#define foreach_rdma_device_flags \
+  _(0, INITIALIZED, "initialized") \
+  _(1, ERROR, "error") \
+  _(2, ADMIN_UP, "admin-up") \
+  _(3, VA_DMA, "vaddr-dma") \
+  _(4, LINK_UP, "link-up") \
+  _(5, SHARED_TXQ_LOCK, "shared-txq-lock") \
+  _(6, ELOG, "elog")
+
+/* RDMA_DEVICE_F_<suffix> bit masks, one per entry above */
+enum
+{
+#define _(a, b, c) RDMA_DEVICE_F_##b = (1 << a),
+  foreach_rdma_device_flags
+#undef _
+};
+
+/* Receive queue state, used by the input path. */
+typedef struct
+{
+ CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
+ /* number of receive buffers to keep posted; refill stops at this count */
+ u32 size;
+ /* receive buffers posted and not yet completed */
+ u32 n_enq;
+ /* completion queue polled for received packets */
+ struct ibv_cq *cq;
+ /* queue pair the receive work requests are posted on */
+ struct ibv_qp *qp;
+} rdma_rxq_t;
+
+/* Transmit queue state, used by the output path. */
+typedef struct
+{
+ CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
+ /* NOTE(review): size and n_enq presumably mirror rdma_rxq_t - confirm */
+ u32 size;
+ u32 n_enq;
+ /* completion queue polled to find buffers that can be freed */
+ struct ibv_cq *cq;
+ /* queue pair the send work requests are posted on */
+ struct ibv_qp *qp;
+ /* taken around the TX function when it has been initialized */
+ clib_spinlock_t lock;
+} rdma_txq_t;
+
+/* Per-device state, one element of rdma_main.devices. */
+typedef struct
+{
+ CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
+ /* bit mask of RDMA_DEVICE_F_* values */
+ u32 flags;
+ /* next index used by the input path instead of ethernet-input; ~0 = none */
+ u32 per_interface_next_index;
+
+ /* number printed in the interface name ("rdma-<dev_instance>") */
+ u32 dev_instance;
+ /* written to the RX sw_if_index of every received buffer */
+ u32 sw_if_index;
+ /* hardware interface index, used for link state and traces */
+ u32 hw_if_index;
+
+ /* NOTE(review): presumably the clib file watching async events - confirm */
+ u32 async_event_clib_file_index;
+
+ /* receive queues, indexed by queue id */
+ rdma_rxq_t *rxqs;
+ /* transmit queues, selected from the thread index */
+ rdma_txq_t *txqs;
+
+ /* NOTE(review): presumably the MAC and PCI address of the port - confirm */
+ u8 hwaddr[6];
+ vlib_pci_addr_t pci_addr;
+
+ /* ibverbs objects; mr->lkey is the key put in every RX and TX SGE */
+ struct ibv_context *ctx;
+ struct ibv_pd *pd;
+ struct ibv_mr *mr;
+ /* NOTE(review): presumably the unicast/multicast steering rules - confirm */
+ struct ibv_flow *flow_ucast;
+ struct ibv_flow *flow_mcast;
+
+ /* error shown by format_rdma_device when set */
+ clib_error_t *error;
+} rdma_device_t;
+
+/* Plugin-wide state. */
+typedef struct
+{
+ /* all devices, indexed by the dev_instance of the interface */
+ rdma_device_t *devices;
+ /* log class registered under the name "rdma" by rdma_init */
+ vlib_log_class_t log_class;
+} rdma_main_t;
+
+/* the single instance of the plugin state */
+extern rdma_main_t rdma_main;
+
+/* Arguments and results of rdma_create_if. */
+typedef struct
+{
+ /* NOTE(review): presumably the name of the device to attach to - confirm */
+ u8 *ifname;
+
+ /* return */
+ /* NOTE(review): rv/error presumably report failure, sw_if_index the
+    created interface - confirm against rdma_create_if */
+ int rv;
+ u32 sw_if_index;
+ clib_error_t *error;
+} rdma_create_if_args_t;
+
+/* interface creation and deletion */
+void rdma_create_if (vlib_main_t * vm, rdma_create_if_args_t * args);
+void rdma_delete_if (vlib_main_t * vm, rdma_device_t * rd);
+
+/* input node and device class registrations */
+extern vlib_node_registration_t rdma_input_node;
+extern vnet_device_class_t rdma_device_class;
+
+/* format.c */
+format_function_t format_rdma_device;
+/* defined with external linkage in format.c, so it is declared here too */
+format_function_t format_rdma_device_flags;
+format_function_t format_rdma_device_name;
+format_function_t format_rdma_input_trace;
+
+/* Trace record written by the input path, printed by format_rdma_input_trace. */
+typedef struct
+{
+ /* next index the packet was sent to */
+ u32 next_index;
+ /* hardware interface the packet was received on */
+ u32 hw_if_index;
+} rdma_input_trace_t;
+
+/* TX error counters: _(symbol, description) */
+#define foreach_rdma_tx_func_error \
+  _(NO_FREE_SLOTS, "no free tx slots")
+
+/* One RDMA_TX_ERROR_<symbol> per entry, followed by the entry count */
+typedef enum
+{
+#define _(sym, str) RDMA_TX_ERROR_##sym,
+  foreach_rdma_tx_func_error
+#undef _
+    RDMA_TX_N_ERROR,
+} rdma_tx_func_error_t;
+
+#endif /* _RDMA_H_ */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/tap/tap.c b/src/vnet/devices/tap/tap.c
index 0b2ebd6..2649f68 100644
--- a/src/vnet/devices/tap/tap.c
+++ b/src/vnet/devices/tap/tap.c
@@ -347,16 +347,8 @@
}
if (!args->mac_addr_set)
- {
- f64 now = vlib_time_now (vm);
- u32 rnd;
- rnd = (u32) (now * 1e6);
- rnd = random_u32 (&rnd);
+ ethernet_mac_address_generate (args->mac_addr);
- memcpy (args->mac_addr + 2, &rnd, sizeof (rnd));
- args->mac_addr[0] = 2;
- args->mac_addr[1] = 0xfe;
- }
vif->rx_ring_sz = args->rx_ring_sz != 0 ? args->rx_ring_sz : 256;
vif->tx_ring_sz = args->tx_ring_sz != 0 ? args->tx_ring_sz : 256;
clib_memcpy (vif->mac_addr, args->mac_addr, 6);
diff --git a/src/vnet/ethernet/mac_address.h b/src/vnet/ethernet/mac_address.h
index 87a66a2..01fb76e 100644
--- a/src/vnet/ethernet/mac_address.h
+++ b/src/vnet/ethernet/mac_address.h
@@ -70,6 +70,17 @@
return ((*((u32 *) mac) == 0) && (*((u16 *) (mac + 4)) == 0));
}
+/*
+ * Write a generated MAC address to the 6 bytes at mac: bytes 0 and 1 are
+ * fixed to 02:fe, bytes 2 to 5 are the result of random_u32() seeded with
+ * the CPU time counter.
+ */
+static inline void
+ethernet_mac_address_generate (u8 * mac)
+{
+  u32 seed = clib_cpu_time_now ();
+  u32 tail = random_u32 (&seed);
+
+  mac[0] = 2;
+  mac[1] = 0xfe;
+  memcpy (mac + 2, &tail, sizeof (tail));
+}
+
static inline int
ethernet_mac_address_equal (const u8 * a, const u8 * b)
{