vppinfra: add perf testing to test_vector_func

Type: improvement
Change-Id: I7aacd58d113c13036c15655817400032dd8d1932
Signed-off-by: Damjan Marion <damarion@cisco.com>
diff --git a/src/vppinfra/vector/test/array_mask.c b/src/vppinfra/vector/test/array_mask.c
index a1f4da7..904bb21 100644
--- a/src/vppinfra/vector/test/array_mask.c
+++ b/src/vppinfra/vector/test/array_mask.c
@@ -6,7 +6,7 @@
 #include <vppinfra/vector/test/test.h>
 #include <vppinfra/vector/array_mask.h>
 
-__clib_test_fn void
+__test_funct_fn void
 clib_array_mask_u32_wrapper (u32 *src, u32 mask, u32 n_elts)
 {
   clib_array_mask_u32 (src, mask, n_elts);
diff --git a/src/vppinfra/vector/test/compress.c b/src/vppinfra/vector/test/compress.c
index 9bc53ff..4f3fd53 100644
--- a/src/vppinfra/vector/test/compress.c
+++ b/src/vppinfra/vector/test/compress.c
@@ -6,25 +6,25 @@
 #include <vppinfra/vector/test/test.h>
 #include <vppinfra/vector/compress.h>
 
-__clib_test_fn u32
+__test_funct_fn u32
 clib_compress_u64_wrapper (u64 *dst, u64 *src, u64 *mask, u32 n_elts)
 {
   return clib_compress_u64 (dst, src, mask, n_elts);
 }
 
-__clib_test_fn u32
+__test_funct_fn u32
 clib_compress_u32_wrapper (u32 *dst, u32 *src, u64 *mask, u32 n_elts)
 {
   return clib_compress_u32 (dst, src, mask, n_elts);
 }
 
-__clib_test_fn u32
+__test_funct_fn u32
 clib_compress_u16_wrapper (u16 *dst, u16 *src, u64 *mask, u32 n_elts)
 {
   return clib_compress_u16 (dst, src, mask, n_elts);
 }
 
-__clib_test_fn u32
+__test_funct_fn u32
 clib_compress_u8_wrapper (u8 *dst, u8 *src, u64 *mask, u32 n_elts)
 {
   return clib_compress_u8 (dst, src, mask, n_elts);
diff --git a/src/vppinfra/vector/test/count_equal.c b/src/vppinfra/vector/test/count_equal.c
index 1ca9735..c57b027 100644
--- a/src/vppinfra/vector/test/count_equal.c
+++ b/src/vppinfra/vector/test/count_equal.c
@@ -9,8 +9,8 @@
 #define foreach_clib_count_equal(type)                                        \
   typedef uword (wrapper_fn_##type) (type * a, uword maxcount);               \
                                                                               \
-  __clib_test_fn uword clib_count_equal_##type##_wrapper (type *a,            \
-							  uword maxcount)     \
+  __test_funct_fn uword clib_count_equal_##type##_wrapper (type *a,           \
+							   uword maxcount)    \
   {                                                                           \
     return clib_count_equal_##type (a, maxcount);                             \
   }                                                                           \
diff --git a/src/vppinfra/vector/test/index_to_ptr.c b/src/vppinfra/vector/test/index_to_ptr.c
index ae33020..7ee3b94 100644
--- a/src/vppinfra/vector/test/index_to_ptr.c
+++ b/src/vppinfra/vector/test/index_to_ptr.c
@@ -9,7 +9,7 @@
 typedef void (wrapper_fn) (u32 *indices, void *base, u8 shift, void **ptrs,
 			   u32 n_elts);
 
-__clib_test_fn void
+__test_funct_fn void
 clib_index_to_ptr_u32_wrapper (u32 *indices, void *base, u8 shift, void **ptrs,
 			       u32 n_elts)
 {
diff --git a/src/vppinfra/vector/test/ip_csum.c b/src/vppinfra/vector/test/ip_csum.c
index 135d5ae..cb33c03 100644
--- a/src/vppinfra/vector/test/ip_csum.c
+++ b/src/vppinfra/vector/test/ip_csum.c
@@ -80,7 +80,7 @@
 test_clib_ip_csum (clib_error_t *err)
 {
   u8 *buf;
-  buf = clib_mem_alloc_aligned (65536, CLIB_CACHE_LINE_BYTES);
+  buf = test_mem_alloc (65536);
   for (int i = 0; i < 65536; i++)
     buf[i] = 0xf0 + ((i * 7) & 0xf);
 
@@ -110,11 +110,72 @@
 	}
     }
 done:
-  clib_mem_free (buf);
+  test_mem_free (buf);
   return err;
 }
 
+void __test_perf_fn
+perftest_ip4_hdr (int fd, test_perf_t *tp)
+{
+  u32 n_hdrs = tp->n_ops; /* one 20-byte header is checksummed per op */
+  u8 *hdrs = test_mem_alloc_and_splat (20, n_hdrs, (void *) &test1);
+  u16 *csums = test_mem_alloc (n_hdrs * sizeof (u16));
+
+  test_perf_event_enable (fd); /* counters run only around the loop */
+  for (int i = 0; i < n_hdrs; i++)
+    csums[i] = clib_ip_csum (hdrs + i * 20, 20);
+  test_perf_event_disable (fd);
+
+  test_mem_free (hdrs);
+  test_mem_free (csums);
+}
+
+void __test_perf_fn /* perf: checksum n_ops buffers of arg0 bytes each */
+perftest_tcp_payload (int fd, test_perf_t *tp)
+{
+  u32 n = tp->n_ops;
+  volatile uword *lenp = &tp->arg0; /* per-op length, re-read on each use */
+  u8 *data = test_mem_alloc_and_fill_inc_u8 (n * lenp[0], 0, 0);
+  u16 *res = test_mem_alloc (n * sizeof (u16));
+
+  test_perf_event_enable (fd); /* counters run only around the loop */
+  for (int i = 0; i < n; i++)
+    res[i] = clib_ip_csum (data + i * lenp[0], lenp[0]);
+  test_perf_event_disable (fd);
+
+  test_mem_free (data);
+  test_mem_free (res);
+}
+
+void __test_perf_fn
+perftest_byte (int fd, test_perf_t *tp)
+{
+  volatile uword *n_bytes = &tp->n_ops; /* one op == one checksummed byte */
+  u8 *buf = test_mem_alloc_and_fill_inc_u8 (*n_bytes, 0, 0);
+  u16 *csum = test_mem_alloc (sizeof (u16));
+
+  test_perf_event_enable (fd); /* counters run only around the call */
+  csum[0] = clib_ip_csum (buf, n_bytes[0]);
+  test_perf_event_disable (fd);
+
+  test_mem_free (buf);
+  test_mem_free (csum);
+}
+
 REGISTER_TEST (clib_ip_csum) = {
   .name = "clib_ip_csum",
   .fn = test_clib_ip_csum,
+  .perf_tests = PERF_TESTS (
+    { .name = "ip4_hdr",
+      .op_name = "IP4Hdr",
+      .n_ops = 1024,
+      .fn = perftest_ip4_hdr },
+    { .name = "tcp_payload",
+      .op_name = "1460Byte",
+      .n_ops = 16,
+      .arg0 = 1460, /* bytes checksummed per op */
+      .fn = perftest_tcp_payload },
+    { .name = "byte", .op_name = "Byte", .n_ops = 16384, .fn = perftest_byte }
+
+    ),
 };
diff --git a/src/vppinfra/vector/test/mask_compare.c b/src/vppinfra/vector/test/mask_compare.c
index 64df0ee..40cac1b 100644
--- a/src/vppinfra/vector/test/mask_compare.c
+++ b/src/vppinfra/vector/test/mask_compare.c
@@ -6,13 +6,13 @@
 #include <vppinfra/vector/test/test.h>
 #include <vppinfra/vector/mask_compare.h>
 
-__clib_test_fn void
+__test_funct_fn void
 clib_mask_compare_u16_wrapper (u16 v, u16 *a, u64 *mask, u32 n_elts)
 {
   clib_mask_compare_u16 (v, a, mask, n_elts);
 }
 
-__clib_test_fn void
+__test_funct_fn void
 clib_mask_compare_u32_wrapper (u32 v, u32 *a, u64 *mask, u32 n_elts)
 {
   clib_mask_compare_u32 (v, a, mask, n_elts);
diff --git a/src/vppinfra/vector/test/memcpy_x86_64.c b/src/vppinfra/vector/test/memcpy_x86_64.c
index 78aab18..edb32d0 100644
--- a/src/vppinfra/vector/test/memcpy_x86_64.c
+++ b/src/vppinfra/vector/test/memcpy_x86_64.c
@@ -8,7 +8,7 @@
 #include <vppinfra/vector/test/test.h>
 #include <vppinfra/vector/mask_compare.h>
 
-__clib_test_fn void
+__test_funct_fn void
 wrapper (u8 *dst, u8 *src, uword n)
 {
   clib_memcpy_x86_64 (dst, src, n);
diff --git a/src/vppinfra/vector/test/test.c b/src/vppinfra/vector/test/test.c
index 1a8b9d6..d098766 100644
--- a/src/vppinfra/vector/test/test.c
+++ b/src/vppinfra/vector/test/test.c
@@ -4,8 +4,9 @@
 
 #include <vppinfra/format.h>
 #include <vppinfra/vector/test/test.h>
+#include <vppinfra/error.h>
 
-test_registration_t *test_registrations[CLIB_MARCH_TYPE_N_VARIANTS] = {};
+test_main_t test_main;
 
 int
 test_march_supported (clib_march_variant_type_t type)
@@ -18,14 +19,12 @@
     return 0;
 }
 
-int
-main (int argc, char *argv[])
+clib_error_t *
+test_funct (test_main_t *tm)
 {
-  clib_mem_init (0, 64ULL << 20);
-
   for (int i = 0; i < CLIB_MARCH_TYPE_N_VARIANTS; i++)
     {
-      test_registration_t *r = test_registrations[i];
+      test_registration_t *r = tm->registrations[i];
 
       if (r == 0 || test_march_supported (i) < 0)
 	continue;
@@ -51,3 +50,241 @@
   fformat (stdout, "\n");
   return 0;
 }
+
+#define TEST_PERF_MAX_EVENTS 7
+typedef struct
+{
+  u64 config[TEST_PERF_MAX_EVENTS]; /* perf_event_attr.config per event */
+  u8 n_events; /* number of config[] entries that get opened */
+  format_function_t *format_fn; /* prints column headings or values */
+} test_perf_event_bundle_t;
+
+static u8 *
+format_test_perf_bundle_default (u8 *s, va_list *args)
+{
+  /* arguments: bundle (unused), perf test, counter values or 0 */
+  test_perf_event_bundle_t __clib_unused *bundle =
+    va_arg (*args, test_perf_event_bundle_t *);
+  test_perf_t *test = va_arg (*args, test_perf_t *);
+  u64 *cnt = va_arg (*args, u64 *);
+
+  /* without counter values only the column headings are produced */
+  if (cnt == 0)
+    {
+      s = format (s, "%5s", "IPC");
+      s = format (s, "%8s", "Clks/Op");
+      s = format (s, "%8s", "Inst/Op");
+      s = format (s, "%9s", "Brnch/Op");
+      s = format (s, "%10s", "BrMiss/Op");
+      return s;
+    }
+
+  /*
+   * cnt[0] cycles, cnt[1] instructions, cnt[2] branches,
+   * cnt[3] branch misses - the event order of the default bundle
+   */
+  /* instructions per cycle */
+  s = format (s, "%5.2f", (f64) cnt[1] / cnt[0]);
+
+  /* everything else is reported per operation */
+  s = format (s, "%8.2f", (f64) cnt[0] / test->n_ops);
+  s = format (s, "%8.2f", (f64) cnt[1] / test->n_ops);
+  s = format (s, "%9.2f", (f64) cnt[2] / test->n_ops);
+  s = format (s, "%10.2f", (f64) cnt[3] / test->n_ops);
+  return s;
+}
+
+test_perf_event_bundle_t perf_bundles[] = { {
+  .config[0] = PERF_COUNT_HW_CPU_CYCLES, /* formatter reads it as data[0] */
+  .config[1] = PERF_COUNT_HW_INSTRUCTIONS, /* data[1] */
+  .config[2] = PERF_COUNT_HW_BRANCH_INSTRUCTIONS, /* data[2] */
+  .config[3] = PERF_COUNT_HW_BRANCH_MISSES, /* data[3] */
+  .n_events = 4,
+  .format_fn = format_test_perf_bundle_default,
+} };
+
+#ifdef __linux__ /* test_perf relies on the perf_event_open syscall */
+clib_error_t *
+test_perf (test_main_t *tm) /* runs every registered perf test */
+{
+  clib_error_t *err = 0;
+  test_perf_event_bundle_t *b = perf_bundles; /* only the first bundle */
+  int group_fd = -1, fds[TEST_PERF_MAX_EVENTS];
+  u64 count[TEST_PERF_MAX_EVENTS + 3] = {}; /* 3 header words + values */
+  struct perf_event_attr pe = {
+    .size = sizeof (struct perf_event_attr),
+    .type = PERF_TYPE_HARDWARE,
+    .disabled = 1, /* tests enable/disable counting themselves */
+    .exclude_kernel = 1,
+    .exclude_hv = 1,
+    .pinned = 1,
+    .exclusive = 1,
+    .read_format = (PERF_FORMAT_GROUP | PERF_FORMAT_TOTAL_TIME_ENABLED |
+		    PERF_FORMAT_TOTAL_TIME_RUNNING),
+  };
+
+  for (int i = 0; i < TEST_PERF_MAX_EVENTS; i++)
+    fds[i] = -1; /* -1 marks "not opened" for the cleanup at done: */
+
+  for (int i = 0; i < b->n_events; i++)
+    {
+      pe.config = b->config[i];
+      int fd = syscall (__NR_perf_event_open, &pe, /* pid */ 0, /* cpu */ -1,
+			/* group_fd */ group_fd, /* flags */ 0);
+      if (fd < 0)
+	{
+	  err = clib_error_return_unix (0, "perf_event_open");
+	  goto done;
+	}
+
+      if (group_fd == -1) /* first event opened leads the group */
+	{
+	  group_fd = fd;
+	  pe.pinned = 0; /* pinned/exclusive are requested for the leader only */
+	  pe.exclusive = 0;
+	}
+      fds[i] = fd;
+    }
+
+  for (int i = 0; i < CLIB_MARCH_TYPE_N_VARIANTS; i++)
+    {
+      test_registration_t *r = tm->registrations[i];
+
+      if (r == 0 || test_march_supported (i) < 0)
+	continue;
+
+      fformat (stdout, "\nMultiarch Variant: %U\n", format_march_variant, i);
+      fformat (stdout,
+	       "-------------------------------------------------------\n");
+      while (r)
+	{
+	  if (r->perf_tests)
+	    {
+	      test_perf_t *pt = r->perf_tests;
+	      fformat (stdout, "%-22s%-12s%U\n", r->name, "OpType",
+		       b->format_fn, b, pt, 0UL);
+	      do
+		{
+		  u32 read_size = (b->n_events + 3) * sizeof (u64);
+		  for (int i = 0; i < tm->repeat; i++)
+		    {
+		      test_perf_event_reset (group_fd);
+		      pt->fn (group_fd, pt);
+		      if ((read (group_fd, &count, read_size) != read_size))
+			{
+			  err = clib_error_return_unix (0, "read");
+			  goto done;
+			}
+		      if (count[1] != count[2]) /* enabled != running */
+			clib_warning (
+			  "perf counters were not running all the time."
+#ifdef __x86_64__
+			  "\nConsider turning NMI watchdog off ('sysctl -w "
+			  "kernel.nmi_watchdog=0')."
+#endif
+			);
+		      fformat (stdout, "  %-20s%-12s%U\n", pt->name,
+			       pt->op_name ? pt->op_name : "", b->format_fn, b,
+			       pt, count + 3);
+		    }
+		}
+	      while ((++pt)->fn); /* list ends at a null fn */
+	    }
+	  r = r->next;
+	}
+    }
+
+done:
+  for (int i = 0; i < TEST_PERF_MAX_EVENTS; i++)
+    if (fds[i] != -1)
+      close (fds[i]);
+  return err;
+}
+#endif
+
+int
+main (int argc, char *argv[])
+{
+  test_main_t *tm = &test_main;
+  unformat_input_t input = {};
+  clib_error_t *error;
+  int run_perf = 0;
+
+  clib_mem_init (0, 64ULL << 20);
+
+  /* defaults */
+  tm->repeat = 3;
+
+  /* recognized arguments: "perf" and "repeat <count>" */
+  unformat_init_command_line (&input, argv);
+  while (unformat_check_input (&input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (&input, "perf"))
+	run_perf = 1;
+      else if (!unformat (&input, "repeat %d", &tm->repeat))
+	{
+	  /* neither keyword matched: report the leftover input and bail */
+	  clib_warning ("unknown input '%U'", format_unformat_error, &input);
+	  exit (1);
+	}
+    }
+
+  /* "perf" selects the perf tests, otherwise the functional ones run */
+  if (run_perf)
+    error = test_perf (tm);
+  else
+    error = test_funct (tm);
+
+  if (error == 0)
+    return 0;
+
+  clib_error_report (error);
+  fformat (stderr, "\n");
+  return 1;
+}
+
+void *
+test_mem_alloc (uword size)
+{
+  /* pad to whole cache lines, then hand back zero-filled storage */
+  uword padded = round_pow2 (size, CLIB_CACHE_LINE_BYTES);
+  void *mem = clib_mem_alloc_aligned (padded, CLIB_CACHE_LINE_BYTES);
+  clib_memset_u8 (mem, 0, padded);
+  return mem;
+}
+
+void *
+test_mem_alloc_and_fill_inc_u8 (uword size, u8 start, u8 mask)
+{
+  /* a zero mask means "keep every bit" */
+  u8 keep = mask ? mask : 0xff;
+  uword padded = round_pow2 (size, CLIB_CACHE_LINE_BYTES);
+  u8 *bytes = clib_mem_alloc_aligned (padded, CLIB_CACHE_LINE_BYTES);
+  for (uword pos = 0; pos < padded; pos++)
+    bytes[pos] = ((u8) pos + start) & keep;
+  return bytes;
+}
+
+void *
+test_mem_alloc_and_splat (uword elt_size, uword n_elts, void *elt)
+{
+  uword used = elt_size * n_elts;
+  uword padded = round_pow2 (used, CLIB_CACHE_LINE_BYTES);
+  u8 *base = clib_mem_alloc_aligned (padded, CLIB_CACHE_LINE_BYTES);
+  u8 *tail = base + used;
+
+  /* lay down n_elts back-to-back copies of *elt */
+  for (u8 *dst = base; dst < tail; dst += elt_size)
+    clib_memcpy_fast (dst, elt, elt_size);
+
+  /* zero the padding added by rounding up to a cache line */
+  if (used < padded)
+    clib_memset_u8 (tail, 0, padded - used);
+  return base;
+}
+
+void
+test_mem_free (void *p)
+{
+  clib_mem_free (p); /* pairs with the test_mem_alloc* helpers */
+}
diff --git a/src/vppinfra/vector/test/test.h b/src/vppinfra/vector/test/test.h
index bc499fb..02169c1 100644
--- a/src/vppinfra/vector/test/test.h
+++ b/src/vppinfra/vector/test/test.h
@@ -6,20 +6,44 @@
 #define included_test_test_h
 
 #include <vppinfra/cpu.h>
+#ifdef __linux__
+#include <sys/ioctl.h>
+#include <linux/perf_event.h>
+#endif
 
 typedef clib_error_t *(test_fn_t) (clib_error_t *);
 
+struct test_perf_;
+typedef void (test_perf_fn_t) (int fd, struct test_perf_ *tp);
+
+typedef struct test_perf_
+{
+  u64 n_ops; /* operations per run; counters are reported per op */
+  u64 arg0; /* free-form argument interpreted by fn */
+  char *op_name; /* label for the OpType column, may be null */
+  char *name; /* label of the result row */
+  test_perf_fn_t *fn; /* test body; a null fn terminates a test array */
+} test_perf_t;
+
 typedef struct test_registration_
 {
   char *name;
   u8 multiarch : 1;
   test_fn_t *fn;
+  test_perf_t *perf_tests; /* optional, ends at an entry with fn == 0 */
+  u32 n_perf_tests; /* NOTE(review): not set or read in this patch */
   struct test_registration_ *next;
 } test_registration_t;
 
-extern test_registration_t *test_registrations[CLIB_MARCH_TYPE_N_VARIANTS];
+typedef struct /* global test state: per-march test lists + options */
+{
+  test_registration_t *registrations[CLIB_MARCH_TYPE_N_VARIANTS];
+  u32 repeat; /* runs per perf test, set by "repeat %d" on the cmdline */
+} test_main_t;
+extern test_main_t test_main;
 
-#define __clib_test_fn static __clib_noinline __clib_section (".test_wrapper")
+#define __test_funct_fn static __clib_noinline __clib_section (".test_func")
+#define __test_perf_fn	static __clib_noinline __clib_section (".test_perf")
 
 #define REGISTER_TEST(x)                                                      \
   test_registration_t CLIB_MARCH_SFX (__test_##x);                            \
@@ -27,9 +51,50 @@
     void)                                                                     \
   {                                                                           \
     test_registration_t *r = &CLIB_MARCH_SFX (__test_##x);                    \
-    r->next = test_registrations[CLIB_MARCH_SFX (CLIB_MARCH_VARIANT_TYPE)];   \
-    test_registrations[CLIB_MARCH_SFX (CLIB_MARCH_VARIANT_TYPE)] = r;         \
+    r->next =                                                                 \
+      test_main.registrations[CLIB_MARCH_SFX (CLIB_MARCH_VARIANT_TYPE)];      \
+    test_main.registrations[CLIB_MARCH_SFX (CLIB_MARCH_VARIANT_TYPE)] = r;    \
   }                                                                           \
   test_registration_t CLIB_MARCH_SFX (__test_##x)
 
+#define PERF_TESTS(...)                                                       \
+  (test_perf_t[])                                                             \
+  {                                                                           \
+    __VA_ARGS__, {} /* all-zero entry ends the list */                        \
+  }
+
+static_always_inline void /* group-wide perf ioctl; raw syscall on x86_64 */
+test_perf_event_ioctl (int fd, u32 req)
+{
+#ifdef __x86_64__ /* "memory": no moving code across the syscall */
+  long __clib_unused rv = __NR_ioctl; /* rax: syscall nr in, result out */
+  asm volatile("syscall" : "+a"(rv)
+	       : "D"(fd), "S"(req), "d"(PERF_IOC_FLAG_GROUP)
+	       : "rcx", "r11", "memory" /* rcx/r11 clobbered by kernel */);
+#else
+  ioctl (fd, req, PERF_IOC_FLAG_GROUP);
+#endif
+}
+
+static_always_inline void
+test_perf_event_reset (int fd)
+{
+  test_perf_event_ioctl (fd, PERF_EVENT_IOC_RESET); /* whole event group */
+}
+static_always_inline void
+test_perf_event_enable (int fd)
+{
+  test_perf_event_ioctl (fd, PERF_EVENT_IOC_ENABLE); /* whole event group */
+}
+static_always_inline void
+test_perf_event_disable (int fd)
+{
+  test_perf_event_ioctl (fd, PERF_EVENT_IOC_DISABLE); /* whole event group */
+}
+
+void *test_mem_alloc (uword size);
+void *test_mem_alloc_and_fill_inc_u8 (uword size, u8 start, u8 mask);
+void *test_mem_alloc_and_splat (uword elt_size, uword n_elts, void *elt);
+void test_mem_free (void *p);
+
 #endif