vppinfra: toeplitz hash four in parallel

Type: improvement
Change-Id: Icb3f39f42d01c51d7b03543cb7d6b5dabad11866
Signed-off-by: Damjan Marion <dmarion@me.com>
diff --git a/src/vppinfra/vector/test/toeplitz.c b/src/vppinfra/vector/test/toeplitz.c
index 0efc423..d425a44 100644
--- a/src/vppinfra/vector/test/toeplitz.c
+++ b/src/vppinfra/vector/test/toeplitz.c
@@ -175,6 +175,13 @@
   return clib_toeplitz_hash (k, data, n_bytes);
 }
 
+__test_funct_fn void /* forwards every argument to clib_toeplitz_hash_x4 */
+wrapper_x4 (clib_toeplitz_hash_key_t *k, u8 *d0, u8 *d1, u8 *d2, u8 *d3,
+	    u32 *h0, u32 *h1, u32 *h2, u32 *h3, u32 n_bytes)
+{
+  clib_toeplitz_hash_x4 (k, d0, d1, d2, d3, h0, h1, h2, h3, n_bytes);
+}
+
 static clib_error_t *
 test_clib_toeplitz_hash (clib_error_t *err)
 {
@@ -222,16 +229,10 @@
   n_key_copies = 6;
   bigkey_len = k->key_length * n_key_copies;
   bigdata_len = bigkey_len - 4;
-  bigkey = clib_mem_alloc (bigkey_len);
-  bigdata = clib_mem_alloc (bigdata_len);
+  bigkey = test_mem_alloc_and_splat (k->key_length, n_key_copies, k->data);
+  bigdata = test_mem_alloc_and_fill_inc_u8 (bigdata_len, 0, 0);
   u32 key_len = k->key_length;
 
-  for (int i = 0; i < n_key_copies; i++)
-    clib_memcpy (bigkey + i * key_len, k->data, key_len);
-
-  for (int i = 0; i < bigdata_len; i++)
-    bigdata[i] = (u8) i;
-
   clib_toeplitz_hash_key_free (k);
   k = clib_toeplitz_hash_key_init (bigkey, n_key_copies * key_len);
 
@@ -252,8 +253,8 @@
 
 done:
   clib_toeplitz_hash_key_free (k);
-  clib_mem_free (bigkey);
-  clib_mem_free (bigdata);
+  test_mem_free (bigkey);
+  test_mem_free (bigdata);
   return err;
 }
 
@@ -334,3 +335,225 @@
 			      .n_ops = 16384,
 			      .fn = perftest_variable_size }),
 };
+
+static clib_error_t *
+test_clib_toeplitz_hash_x4 (clib_error_t *err)
+{
+  u32 r[4];
+  int n_key_copies, bigkey_len, bigdata_len;
+  u8 *bigkey, *bigdata0, *bigdata1, *bigdata2, *bigdata3;
+  clib_toeplitz_hash_key_t *k;
+  /* Checks the x4 hash against the IPv4/IPv6 vectors and length_test_hashes.
+   * NOTE(review): the early returns below skip done: and never free k. */
+  k = clib_toeplitz_hash_key_init (0, 0);
+  /* IPv4 2-tuple: 8 bytes from each of four test keys in one x4 call */
+  wrapper_x4 (k, (u8 *) &ip4_tests[0].key, (u8 *) &ip4_tests[1].key,
+	      (u8 *) &ip4_tests[2].key, (u8 *) &ip4_tests[3].key, r, r + 1,
+	      r + 2, r + 3, 8);
+  if (ip4_tests[0].hash_2t != r[0] || ip4_tests[1].hash_2t != r[1] ||
+      ip4_tests[2].hash_2t != r[2] || ip4_tests[3].hash_2t != r[3])
+    return clib_error_return (err,
+			      "wrong IPv4 2 tuple x4 hash "
+			      "calculated { 0x%08x, 0x%08x, 0x%08x, 0x%08x } "
+			      "expected { 0x%08x, 0x%08x, 0x%08x, 0x%08x }",
+			      r[0], r[1], r[2], r[3], ip4_tests[0].hash_2t,
+			      ip4_tests[1].hash_2t, ip4_tests[2].hash_2t,
+			      ip4_tests[3].hash_2t);
+
+  /* IPv4 4-tuple: the same four keys, 12 bytes each */
+  wrapper_x4 (k, (u8 *) &ip4_tests[0].key, (u8 *) &ip4_tests[1].key,
+	      (u8 *) &ip4_tests[2].key, (u8 *) &ip4_tests[3].key, r, r + 1,
+	      r + 2, r + 3, 12);
+  if (ip4_tests[0].hash_4t != r[0] || ip4_tests[1].hash_4t != r[1] ||
+      ip4_tests[2].hash_4t != r[2] || ip4_tests[3].hash_4t != r[3])
+    return clib_error_return (err,
+			      "wrong IPv4 4 tuple x4 hash "
+			      "calculated { 0x%08x, 0x%08x, 0x%08x, 0x%08x } "
+			      "expected { 0x%08x, 0x%08x, 0x%08x, 0x%08x }",
+			      r[0], r[1], r[2], r[3], ip4_tests[0].hash_4t,
+			      ip4_tests[1].hash_4t, ip4_tests[2].hash_4t,
+			      ip4_tests[3].hash_4t);
+
+  /* IPv6 2-tuple: 32 bytes per lane; lane 3 reuses ip6_tests[0] */
+  wrapper_x4 (k, (u8 *) &ip6_tests[0].key, (u8 *) &ip6_tests[1].key,
+	      (u8 *) &ip6_tests[2].key, (u8 *) &ip6_tests[0].key, r, r + 1,
+	      r + 2, r + 3, 32);
+  if (ip6_tests[0].hash_2t != r[0] || ip6_tests[1].hash_2t != r[1] ||
+      ip6_tests[2].hash_2t != r[2] || ip6_tests[0].hash_2t != r[3])
+    return clib_error_return (err,
+			      "wrong IPv6 2 tuple x4 hash "
+			      "calculated { 0x%08x, 0x%08x, 0x%08x, 0x%08x } "
+			      "expected { 0x%08x, 0x%08x, 0x%08x, 0x%08x }",
+			      r[0], r[1], r[2], r[3], ip6_tests[0].hash_2t,
+			      ip6_tests[1].hash_2t, ip6_tests[2].hash_2t,
+			      ip6_tests[0].hash_2t);
+
+  /* IPv6 4-tuple: 36 bytes per lane; lane 3 reuses ip6_tests[0] */
+  wrapper_x4 (k, (u8 *) &ip6_tests[0].key, (u8 *) &ip6_tests[1].key,
+	      (u8 *) &ip6_tests[2].key, (u8 *) &ip6_tests[0].key, r, r + 1,
+	      r + 2, r + 3, 36);
+  if (ip6_tests[0].hash_4t != r[0] || ip6_tests[1].hash_4t != r[1] ||
+      ip6_tests[2].hash_4t != r[2] || ip6_tests[0].hash_4t != r[3])
+    return clib_error_return (err,
+			      "wrong IPv6 4 tuple x4 hash "
+			      "calculated { 0x%08x, 0x%08x, 0x%08x, 0x%08x } "
+			      "expected { 0x%08x, 0x%08x, 0x%08x, 0x%08x }",
+			      r[0], r[1], r[2], r[3], ip6_tests[0].hash_4t,
+			      ip6_tests[1].hash_4t, ip6_tests[2].hash_4t,
+			      ip6_tests[0].hash_4t);
+
+  n_key_copies = 6;
+  bigkey_len = k->key_length * n_key_copies;
+  bigdata_len = bigkey_len - 4;
+  bigkey = test_mem_alloc_and_splat (k->key_length, n_key_copies, k->data);
+  bigdata0 = test_mem_alloc_and_fill_inc_u8 (bigdata_len, 0, 0);
+  bigdata1 = test_mem_alloc_and_fill_inc_u8 (bigdata_len, 0, 0);
+  bigdata2 = test_mem_alloc_and_fill_inc_u8 (bigdata_len, 0, 0);
+  bigdata3 = test_mem_alloc_and_fill_inc_u8 (bigdata_len, 0, 0);
+  u32 key_len = k->key_length;
+  /* swap in a key built from n_key_copies back-to-back copies of k->data */
+  clib_toeplitz_hash_key_free (k);
+  k = clib_toeplitz_hash_key_init (bigkey, n_key_copies * key_len);
+
+  for (int i = 0; i < N_LENGTH_TESTS - 4; i++)
+    {
+      wrapper_x4 (k, bigdata0, bigdata1, bigdata2, bigdata3, r, r + 1, r + 2,
+		  r + 3, i);
+      if (length_test_hashes[i] != r[0] || length_test_hashes[i] != r[1] ||
+	  length_test_hashes[i] != r[2] || length_test_hashes[i] != r[3])
+	{
+	  err = clib_error_return (
+	    err,
+	    "wrong length test hash x4 for length %u, "
+	    "calculated { 0x%08x, 0x%08x, 0x%08x, 0x%08x }, expected 0x%08x",
+	    i, r[0], r[1], r[2], r[3], length_test_hashes[i]);
+	  goto done;
+	}
+    }
+
+done:
+  clib_toeplitz_hash_key_free (k);
+  test_mem_free (bigkey);
+  test_mem_free (bigdata0);
+  test_mem_free (bigdata1);
+  test_mem_free (bigdata2);
+  test_mem_free (bigdata3);
+  return err;
+}
+
+void __test_perf_fn
+perftest_fixed_12byte_x4 (int fd, test_perf_t *tp)
+{
+  /* Runs n_ops / 4 x4 calls on 12-byte tuples between perf enable/disable */
+  u32 n_x4 = tp->n_ops / 4;
+  u8 *t0 = test_mem_alloc_and_splat (12, n_x4, (void *) &ip4_tests[0].key);
+  u8 *t1 = test_mem_alloc_and_splat (12, n_x4, (void *) &ip4_tests[1].key);
+  u8 *t2 = test_mem_alloc_and_splat (12, n_x4, (void *) &ip4_tests[2].key);
+  u8 *t3 = test_mem_alloc_and_splat (12, n_x4, (void *) &ip4_tests[3].key);
+  u32 *r0 = test_mem_alloc (sizeof (u32) * n_x4);
+  u32 *r1 = test_mem_alloc (sizeof (u32) * n_x4);
+  u32 *r2 = test_mem_alloc (sizeof (u32) * n_x4);
+  u32 *r3 = test_mem_alloc (sizeof (u32) * n_x4);
+  clib_toeplitz_hash_key_t *key = clib_toeplitz_hash_key_init (0, 0);
+
+  test_perf_event_enable (fd);
+  for (u32 i = 0, off = 0; i < n_x4; i++, off += 12)
+    clib_toeplitz_hash_x4 (key, t0 + off, t1 + off, t2 + off, t3 + off,
+			   r0 + i, r1 + i, r2 + i, r3 + i, 12);
+  test_perf_event_disable (fd);
+  clib_toeplitz_hash_key_free (key);
+  test_mem_free (t0);
+  test_mem_free (t1);
+  test_mem_free (t2);
+  test_mem_free (t3);
+  test_mem_free (r0);
+  test_mem_free (r1);
+  test_mem_free (r2);
+  test_mem_free (r3);
+}
+
+void __test_perf_fn
+perftest_fixed_36byte_x4 (int fd, test_perf_t *tp)
+{
+  /* 36B tuples from ip6_tests as in the x4 unit test (lane 3 = vector 0) */
+  u32 n = tp->n_ops / 4;
+  u8 *d0 = test_mem_alloc_and_splat (36, n, (void *) &ip6_tests[0].key);
+  u8 *d1 = test_mem_alloc_and_splat (36, n, (void *) &ip6_tests[1].key);
+  u8 *d2 = test_mem_alloc_and_splat (36, n, (void *) &ip6_tests[2].key);
+  u8 *d3 = test_mem_alloc_and_splat (36, n, (void *) &ip6_tests[0].key);
+  u32 *h0 = test_mem_alloc (4 * n);
+  u32 *h1 = test_mem_alloc (4 * n);
+  u32 *h2 = test_mem_alloc (4 * n);
+  u32 *h3 = test_mem_alloc (4 * n);
+  clib_toeplitz_hash_key_t *k = clib_toeplitz_hash_key_init (0, 0);
+
+  test_perf_event_enable (fd);
+  for (int i = 0; i < n; i++)
+    clib_toeplitz_hash_x4 (k, d0 + i * 36, d1 + i * 36, d2 + i * 36,
+			   d3 + i * 36, h0 + i, h1 + i, h2 + i, h3 + i, 36);
+  test_perf_event_disable (fd);
+  clib_toeplitz_hash_key_free (k);
+  test_mem_free (d0);
+  test_mem_free (d1);
+  test_mem_free (d2);
+  test_mem_free (d3);
+  test_mem_free (h0);
+  test_mem_free (h1);
+  test_mem_free (h2);
+  test_mem_free (h3);
+}
+
+void __test_perf_fn
+perftest_variable_size_x4 (int fd, test_perf_t *tp)
+{
+  /* One x4 call hashing n bytes per lane; key is stretched past n + 4 bytes */
+  u32 key_len, n_keys, n = tp->n_ops / 4;
+  u8 *key;
+  u8 *d0 = test_mem_alloc (n);
+  u8 *d1 = test_mem_alloc (n);
+  u8 *d2 = test_mem_alloc (n);
+  u8 *d3 = test_mem_alloc (n);
+  u32 *h0 = test_mem_alloc (sizeof (u32));
+  u32 *h1 = test_mem_alloc (sizeof (u32));
+  u32 *h2 = test_mem_alloc (sizeof (u32));
+  u32 *h3 = test_mem_alloc (sizeof (u32));
+  clib_toeplitz_hash_key_t *k;
+
+  k = clib_toeplitz_hash_key_init (0, 0);
+  key_len = k->key_length;
+  n_keys = ((n + 4) / k->key_length) + 1;
+  /* element size first, then count: n_keys copies of the key_len-byte key */
+  key = test_mem_alloc_and_splat (key_len, n_keys, k->data);
+  clib_toeplitz_hash_key_free (k);
+  k = clib_toeplitz_hash_key_init (key, key_len * n_keys);
+  test_perf_event_enable (fd);
+  clib_toeplitz_hash_x4 (k, d0, d1, d2, d3, h0, h1, h2, h3, n);
+  test_perf_event_disable (fd);
+  clib_toeplitz_hash_key_free (k);
+  test_mem_free (key);
+  test_mem_free (d0);
+  test_mem_free (d1);
+  test_mem_free (d2);
+  test_mem_free (d3);
+  test_mem_free (h0);
+  test_mem_free (h1);
+  test_mem_free (h2);
+  test_mem_free (h3);
+}
+
+REGISTER_TEST (clib_toeplitz_hash_x4) = { /* x4 unit test + perf cases */
+  .name = "clib_toeplitz_hash_x4",
+  .fn = test_clib_toeplitz_hash_x4,
+  .perf_tests = PERF_TESTS ({ .name = "fixed_12",
+			      .op_name = "12B Tuple",
+			      .n_ops = 1024, /* n_ops / 4 x4 calls */
+			      .fn = perftest_fixed_12byte_x4 },
+			    { .name = "fixed_36",
+			      .op_name = "36B Tuple",
+			      .n_ops = 1024, /* n_ops / 4 x4 calls */
+			      .fn = perftest_fixed_36byte_x4 },
+			    { .name = "variable_size",
+			      .op_name = "Byte",
+			      .n_ops = 16384, /* n_ops / 4 bytes per lane */
+			      .fn = perftest_variable_size_x4 }),
+};