quic: quicly crypto offloading

- Implement our own QUIC packet allocator, which allocates extra memory after
the packet structure to store crypto-offloading related data
- Offload 1-RTT packet encryption/decryption to vnet crypto
- Add a CLI command to change the maximum number of packets per key

Type: feature

Change-Id: I7557fd457d7ba492329d5d8ed192509cbd727f9c
Signed-off-by: MathiasRaoul <mathias.raoul@gmail.com>
diff --git a/src/plugins/quic/quic.c b/src/plugins/quic/quic.c
index 7780c14..5265e5b 100644
--- a/src/plugins/quic/quic.c
+++ b/src/plugins/quic/quic.c
@@ -26,10 +26,14 @@
 #include <quic/quic.h>
 #include <quic/certs.h>
 #include <quic/error.h>
-#include <quic/quic_crypto.h>
 
 #include <quicly/constants.h>
 #include <quicly/defaults.h>
+#include <picotls.h>
+
+#include <quic/quic_crypto.h>
+
+extern quicly_crypto_engine_t quic_crypto_engine;
 
 static char *quic_error_strings[] = {
 #define quic_error(n,s) s,
@@ -37,7 +41,9 @@
 #undef quic_error
 };
 
-static quic_main_t quic_main;
+#define DEFAULT_MAX_PACKETS_PER_KEY 16777216
+
+quic_main_t quic_main;
 static void quic_update_timer (quic_ctx_t * ctx);
 static void quic_check_quic_session_connected (quic_ctx_t * ctx);
 static int quic_reset_connection (u64 udp_session_handle,
@@ -85,6 +91,33 @@
   pool_put (qm->wrk_ctx[thread_index].crypto_ctx_pool, crctx);
 }
 
+static quicly_datagram_t *
+quic_alloc_packet (quicly_packet_allocator_t * self, size_t payloadsize)
+{
+  /* Chunk layout: [quicly_datagram_t][quic_encrypt_cb_ctx][payload] */
+  quic_encrypt_cb_ctx *cb_ctx;
+  quicly_datagram_t *dgram;
+  uint8_t *raw =
+    clib_mem_alloc (sizeof (*dgram) + payloadsize + sizeof (*cb_ctx));
+  if (raw == NULL)
+    return NULL;
+  dgram = (quicly_datagram_t *) raw;
+  cb_ctx = (quic_encrypt_cb_ctx *) (raw + sizeof (*dgram));
+  dgram->data.base = raw + sizeof (*dgram) + sizeof (*cb_ctx);
+  clib_memset (cb_ctx, 0, sizeof (*cb_ctx));	/* no send contexts recorded */
+  return dgram;
+}
+
+static void
+quic_free_packet (quicly_packet_allocator_t * self,
+		  quicly_datagram_t * packet)
+{
+  clib_mem_free (packet);	/* one chunk: header + ctx + payload */
+}
+
+quicly_packet_allocator_t quic_packet_allocator =
+  { quic_alloc_packet, quic_free_packet };	/* alloc, free callbacks */
+
 static int
 quic_app_cert_key_pair_delete_callback (app_cert_key_pair_t * ckpair)
 {
@@ -154,6 +187,32 @@
   return 0;
 }
 
+static clib_error_t *
+quic_set_max_packets_per_key_fn (vlib_main_t * vm,
+				 unformat_input_t * input,
+				 vlib_cli_command_t * cmd)
+{
+  /* CLI handler: parse the new limit and store it in quic_main */
+  unformat_input_t _line_input, *line_input = &_line_input;
+  clib_error_t *error = 0;
+  u64 tmp;
+
+  if (!unformat_user (input, unformat_line_input, line_input))
+    return 0;
+
+  while (!error && unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (line_input, "%U", unformat_memory_size, &tmp))
+	quic_main.max_packets_per_key = tmp;
+      else
+	error = clib_error_return (0, "unknown input '%U'",
+				   format_unformat_error, line_input);
+    }
+
+  unformat_free (line_input);	/* release the line buffer on every path */
+  return error;
+}
+
 static void
 quic_release_crypto_context (u32 crypto_context_index, u8 thread_index)
 {
@@ -203,12 +262,15 @@
   clib_memcpy (quicly_ctx, &quicly_spec_context, sizeof (quicly_context_t));
 
   quicly_ctx->max_packet_size = QUIC_MAX_PACKET_SIZE;
+  quicly_ctx->max_packets_per_key = qm->max_packets_per_key;
   quicly_ctx->tls = ptls_ctx;
   quicly_ctx->stream_open = &on_stream_open;
   quicly_ctx->closed_by_peer = &on_closed_by_peer;
   quicly_ctx->now = &quicly_vpp_now_cb;
   quicly_amend_ptls_context (quicly_ctx->tls);
 
+  quicly_ctx->packet_allocator = &quic_packet_allocator;
+  quicly_ctx->crypto_engine = &quic_crypto_engine;
   quicly_ctx->transport_params.max_data = QUIC_INT_MAX;
   quicly_ctx->transport_params.max_streams_uni = (uint64_t) 1 << 60;
   quicly_ctx->transport_params.max_streams_bidi = (uint64_t) 1 << 60;
@@ -350,7 +412,7 @@
   return pool_elt_at_index (quic_main.ctx_pool[thread_index], ctx_index);
 }
 
-static quic_ctx_t *
+quic_ctx_t *
 quic_get_conn_ctx (quicly_conn_t * conn)
 {
   u64 conn_data;
@@ -633,12 +695,14 @@
 static int
 quic_send_packets (quic_ctx_t * ctx)
 {
+  quic_main_t *qm = &quic_main;
   quicly_datagram_t *packets[QUIC_SEND_PACKET_VEC_SIZE];
   session_t *udp_session;
   quicly_conn_t *conn;
   size_t num_packets, i, max_packets;
   quicly_packet_allocator_t *pa;
   int err = 0;
+  u32 thread_index = vlib_get_thread_index ();
 
   /* We have sctx, get qctx */
   if (quic_ctx_is_stream (ctx))
@@ -669,8 +733,12 @@
       if ((err = quicly_send (conn, packets, &num_packets)))
 	goto quicly_error;
 
+      quic_crypto_batch_tx_packets (&qm->wrk_ctx
+				    [thread_index].crypto_context_batch);
+
       for (i = 0; i != num_packets; ++i)
 	{
+	  quic_crypto_finalize_send_packet (packets[i]);
 	  if ((err = quic_send_datagram (udp_session, packets[i])))
 	    goto quicly_error;
 
@@ -1122,7 +1190,6 @@
 }
 
 /* Transport proto functions */
-
 static int
 quic_connect_stream (session_t * quic_session, session_endpoint_cfg_t * sep)
 {
@@ -1990,7 +2057,7 @@
   ctx = quic_ctx_get (pctx->ctx_index, pctx->thread_index);
   if (ctx->c_s_index != QUIC_SESSION_INVALID)
     {
-      QUIC_DBG (2, "already accepted ctx 0x%x", ctx_index);
+      QUIC_DBG (2, "already accepted ctx 0x%x", ctx->c_s_index);
       return;
     }
 
@@ -2128,6 +2195,8 @@
   if (rv == QUIC_PACKET_TYPE_RECEIVE)
     {
       pctx->ptype = QUIC_PACKET_TYPE_RECEIVE;
+      quic_ctx_t *qctx = quic_ctx_get (pctx->ctx_index, thread_index);
+      quic_crypto_decrypt_packet (qctx, pctx);
       return 0;
     }
   else if (rv == QUIC_PACKET_TYPE_MIGRATE)
@@ -2153,6 +2222,7 @@
 quic_udp_session_rx_callback (session_t * udp_session)
 {
   /*  Read data from UDP rx_fifo and pass it to the quicly conn. */
+  quic_main_t *qm = &quic_main;
   quic_ctx_t *ctx = NULL, *prev_ctx = NULL;
   svm_fifo_t *f = udp_session->rx_fifo;
   u32 max_deq;
@@ -2212,6 +2282,9 @@
 	}
     }
 
+  quic_crypto_batch_rx_packets (&qm->
+				wrk_ctx[thread_index].crypto_context_batch);
+
   for (i = 0; i < max_packets; i++)
     {
       switch (packets_ctx[i].ptype)
@@ -2412,6 +2485,7 @@
 
   vec_validate (qm->ctx_pool, num_threads - 1);
   vec_validate (qm->wrk_ctx, num_threads - 1);
+
   for (i = 0; i < num_threads; i++)
     {
       qm->wrk_ctx[i].next_cid.thread_id = i;
@@ -2421,6 +2495,9 @@
       tw->last_run_time = vlib_time_now (vlib_get_main ());
       clib_bihash_init_24_8 (&qm->wrk_ctx[i].crypto_context_hash,
 			     "quic crypto contexts", 64, 128 << 10);
+
+      qm->wrk_ctx[i].crypto_context_batch.nb_rx_packets = 0;
+      qm->wrk_ctx[i].crypto_context_batch.nb_tx_packets = 0;
     }
 
   clib_bihash_init_16_8 (&qm->connection_hash, "quic connections", 1024,
@@ -2441,7 +2518,9 @@
   quic_register_cipher_suite (CRYPTO_ENGINE_VPP, quic_crypto_cipher_suites);
   quic_register_cipher_suite (CRYPTO_ENGINE_PICOTLS,
 			      ptls_openssl_cipher_suites);
-  qm->default_crypto_engine = CRYPTO_ENGINE_PICOTLS;
+  qm->default_crypto_engine = CRYPTO_ENGINE_VPP;
+  qm->max_packets_per_key = DEFAULT_MAX_PACKETS_PER_KEY;
+  clib_rwlock_init (&qm->crypto_keys_quic_rw_lock);
   vec_free (a->name);
   return 0;
 }
@@ -2760,6 +2839,12 @@
   .short_help = "list quic crypto contextes",
   .function = quic_list_crypto_context_command_fn,
 };
+VLIB_CLI_COMMAND (quic_set_max_packets_per_key, static) =
+{
+  .path = "set quic max_packets_per_key",
+  .short_help = "set quic max_packets_per_key 16777216",
+  .function = quic_set_max_packets_per_key_fn,	/* stores it in quic_main */
+};
 VLIB_PLUGIN_REGISTER () =
 {
   .version = VPP_BUILD_VER,
diff --git a/src/plugins/quic/quic.h b/src/plugins/quic/quic.h
index 5921f3a..98f4ce8 100644
--- a/src/plugins/quic/quic.h
+++ b/src/plugins/quic/quic.h
@@ -24,6 +24,9 @@
 
 #include <quicly.h>
 
+#include <vnet/crypto/crypto.h>
+#include <vppinfra/lock.h>
+
 /* QUIC log levels
  * 1 - errors
  * 2 - connection/stream events
@@ -42,8 +45,11 @@
 #define QUIC_SEND_PACKET_VEC_SIZE 16
 #define QUIC_IV_LEN 17
 
+#define QUIC_MAX_COALESCED_PACKET 4
+
 #define QUIC_SEND_MAX_BATCH_PACKETS 16
 #define QUIC_RCV_MAX_BATCH_PACKETS 16
+
 #define QUIC_DEFAULT_CONN_TIMEOUT (30 * 1000)	/* 30 seconds */
 
 /* Taken from quicly.c */
@@ -62,6 +68,10 @@
 #define QUIC_APP_ACCEPT_NOTIFY_ERROR QUICLY_ERROR_FROM_APPLICATION_ERROR_CODE(0x2)
 #define QUIC_APP_CONNECT_NOTIFY_ERROR QUICLY_ERROR_FROM_APPLICATION_ERROR_CODE(0x3)
 
+#define QUIC_DECRYPT_PACKET_OK 0
+#define QUIC_DECRYPT_PACKET_NOTOFFLOADED 1
+#define QUIC_DECRYPT_PACKET_ERROR 2
+
 #if QUIC_DEBUG
 #define QUIC_DBG(_lvl, _fmt, _args...)   \
   if (_lvl <= QUIC_DEBUG)                \
@@ -156,6 +166,14 @@
   u32 crypto_engine;
   u32 crypto_context_index;
   u8 flags;
+
+  struct
+  {
+    ptls_cipher_context_t *hp_ctx;
+    ptls_aead_context_t *aead_ctx;
+  } ingress_keys;
+  int key_phase_ingress;
+
 } quic_ctx_t;
 
 /* Make sure our custom fields don't overlap with the fields we use in
@@ -191,6 +209,25 @@
   ptls_context_t ptls_ctx;
 } quic_crypto_context_data_t;
 
+typedef struct quic_encrypt_cb_ctx_
+{
+  quicly_datagram_t *packet;	/**< NOTE(review): never written in the visible code */
+  struct quic_finalize_send_packet_cb_ctx_
+  {
+    size_t payload_from;	/**< offset of the payload in packet->data.base */
+    size_t first_byte_at;	/**< offset of the first header byte */
+    ptls_cipher_context_t *hp;	/**< header protection cipher */
+  } snd_ctx[QUIC_MAX_COALESCED_PACKET];	/**< one entry per finalize callback */
+  size_t snd_ctx_count;		/**< number of valid snd_ctx entries */
+} quic_encrypt_cb_ctx;
+
+typedef struct quic_crypto_batch_ctx_
+{
+  vnet_crypto_op_t aead_crypto_tx_packets_ops[QUIC_SEND_MAX_BATCH_PACKETS],
+    aead_crypto_rx_packets_ops[QUIC_RCV_MAX_BATCH_PACKETS];
+  size_t nb_tx_packets, nb_rx_packets;	/**< ops queued in each array */
+} quic_crypto_batch_ctx_t;
+
 typedef struct quic_worker_ctx_
 {
   CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
@@ -199,6 +236,7 @@
   quicly_cid_plaintext_t next_cid;
   crypto_context_t *crypto_ctx_pool;		/**< per thread pool of crypto contexes */
   clib_bihash_24_8_t crypto_context_hash;	/**< per thread [params:crypto_ctx_index] hash */
+  quic_crypto_batch_ctx_t crypto_context_batch;
 } quic_worker_ctx_t;
 
 typedef struct quic_rx_packet_ctx_
@@ -228,6 +266,7 @@
   ptls_cipher_suite_t ***quic_ciphers;	/**< available ciphers by crypto engine */
   uword *available_crypto_engines;	/**< Bitmap for registered engines */
   u8 default_crypto_engine;		/**< Used if you do connect with CRYPTO_ENGINE_NONE (0) */
+  u64 max_packets_per_key;		/**< number of packets that can be sent without a key update */
 
   ptls_handshake_properties_t hs_properties;
   quic_session_cache_t session_cache;
@@ -235,6 +274,8 @@
   u32 udp_fifo_size;
   u32 udp_fifo_prealloc;
   u32 connection_timeout;
+
+  clib_rwlock_t crypto_keys_quic_rw_lock;
 } quic_main_t;
 
 #endif /* __included_quic_h__ */
diff --git a/src/plugins/quic/quic_crypto.c b/src/plugins/quic/quic_crypto.c
index ca2eace..b644bed 100644
--- a/src/plugins/quic/quic_crypto.c
+++ b/src/plugins/quic/quic_crypto.c
@@ -12,14 +12,19 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
-#include <quic/quic_crypto.h>
-#include <quic/quic.h>
-
 #include <vnet/crypto/crypto.h>
+#include <vppinfra/lock.h>
 
-#include <picotls/openssl.h>
+#include <quic/quic.h>
+#include <quic/quic_crypto.h>
+
 #include <quicly.h>
+#include <picotls/openssl.h>
+
+#define QUICLY_EPOCH_1RTT 3
+
+extern quic_main_t quic_main;
+extern quic_ctx_t *quic_get_conn_ctx (quicly_conn_t * conn);
 
 typedef void (*quicly_do_transform_fn) (ptls_cipher_context_t *, void *,
 					const void *, size_t);
@@ -38,13 +43,302 @@
   u32 key_index;
 };
 
+static size_t
+quic_crypto_offload_aead_decrypt (quic_ctx_t * qctx,
+				  ptls_aead_context_t * _ctx, void *_output,
+				  const void *input, size_t inlen,
+				  uint64_t decrypted_pn, const void *aad,
+				  size_t aadlen);
+
 vnet_crypto_main_t *cm = &crypto_main;
 
+void
+quic_crypto_batch_tx_packets (quic_crypto_batch_ctx_t * batch_ctx)
+{
+  /* Run every queued TX AEAD op in one call, then free the per-op IV copies */
+  vnet_crypto_op_t *ops = batch_ctx->aead_crypto_tx_packets_ops;
+  size_t n_ops = batch_ctx->nb_tx_packets, i;
+
+  if (n_ops == 0)
+    return;
+
+  clib_rwlock_reader_lock (&quic_main.crypto_keys_quic_rw_lock);
+  vnet_crypto_process_ops (vlib_get_main (), ops, n_ops);
+  clib_rwlock_reader_unlock (&quic_main.crypto_keys_quic_rw_lock);
+
+  for (i = 0; i < n_ops; i++)
+    clib_mem_free (ops[i].iv);
+  batch_ctx->nb_tx_packets = 0;
+}
+
+void
+quic_crypto_batch_rx_packets (quic_crypto_batch_ctx_t * batch_ctx)
+{
+  /* Run every queued RX AEAD op in one call, then free the per-op IVs.
+   * NOTE(review): op status is not inspected here -- confirm that failed
+   * decryptions are detected elsewhere. */
+  vnet_crypto_op_t *ops = batch_ctx->aead_crypto_rx_packets_ops;
+  size_t n_ops = batch_ctx->nb_rx_packets, i;
+
+  if (n_ops == 0)
+    return;
+  clib_rwlock_reader_lock (&quic_main.crypto_keys_quic_rw_lock);
+  vnet_crypto_process_ops (vlib_get_main (), ops, n_ops);
+  clib_rwlock_reader_unlock (&quic_main.crypto_keys_quic_rw_lock);
+  for (i = 0; i < n_ops; i++)
+    clib_mem_free (ops[i].iv);
+  batch_ctx->nb_rx_packets = 0;
+}
+
+void
+build_iv (ptls_aead_context_t * ctx, uint8_t * iv, uint64_t seq)
+{
+  /* iv = static_iv with its last 8 bytes XORed with big-endian seq */
+  const uint8_t *static_iv = ctx->static_iv;
+  size_t prefix_len = ctx->algo->iv_size - 8;
+  size_t pos;
+  int shift;
+
+  /* leading bytes are copied unchanged */
+  for (pos = 0; pos < prefix_len; pos++)
+    iv[pos] = static_iv[pos];
+
+  /* trailing 8 bytes: most significant byte of seq first */
+  for (shift = 56; shift >= 0; shift -= 8, pos++)
+    iv[pos] = static_iv[pos] ^ (uint8_t) (seq >> shift);
+}
+
+static void
+do_finalize_send_packet (ptls_cipher_context_t * hp,
+			 quicly_datagram_t * packet,
+			 size_t first_byte_at, size_t payload_from)
+{
+  /* Apply QUIC header protection in place: mask the low bits of the first
+   * header byte and the packet-number bytes that precede the payload. */
+  uint8_t *first_byte = packet->data.base + first_byte_at;
+  uint8_t *pn_bytes = packet->data.base + payload_from - QUICLY_SEND_PN_SIZE;
+  uint8_t hpmask[1 + QUICLY_SEND_PN_SIZE] = { 0 };
+  uint8_t first_byte_bits;
+  size_t k;
+
+  /* the mask is derived from a ciphertext sample past the PN field */
+  ptls_cipher_init (hp, pn_bytes + QUICLY_MAX_PN_SIZE);
+  ptls_cipher_encrypt (hp, hpmask, hpmask, sizeof (hpmask));
+
+  /* long headers expose 4 protected bits, short headers 5 */
+  first_byte_bits = QUICLY_PACKET_IS_LONG_HEADER (*first_byte) ? 0xf : 0x1f;
+  *first_byte ^= hpmask[0] & first_byte_bits;
+
+  for (k = 0; k < QUICLY_SEND_PN_SIZE; k++)
+    pn_bytes[k] ^= hpmask[k + 1];
+}
+
+void
+quic_crypto_finalize_send_packet (quicly_datagram_t * packet)
+{
+  /* the per-packet crypto state sits right after the datagram header */
+  quic_encrypt_cb_ctx *cb_ctx = (quic_encrypt_cb_ctx *) (packet + 1);
+  struct quic_finalize_send_packet_cb_ctx_ *snd = cb_ctx->snd_ctx;
+  struct quic_finalize_send_packet_cb_ctx_ *end = snd + cb_ctx->snd_ctx_count;
+
+  /* apply header protection to every QUIC packet recorded in the datagram */
+  for (; snd != end; snd++)
+    do_finalize_send_packet (snd->hp, packet, snd->first_byte_at,
+			     snd->payload_from);
+
+  cb_ctx->snd_ctx_count = 0;
+}
+
+static int
+quic_crypto_setup_cipher (quicly_crypto_engine_t * engine,
+			  quicly_conn_t * conn, size_t epoch, int is_enc,
+			  ptls_cipher_context_t ** hp_ctx,
+			  ptls_aead_context_t ** aead_ctx,
+			  ptls_aead_algorithm_t * aead,
+			  ptls_hash_algorithm_t * hash, const void *secret)
+{
+  /* Build the AEAD context and, when hp_ctx is non-NULL, the header
+   * protection cipher from secret.  1-RTT ingress contexts are also cached
+   * in the quic_ctx_t.  Returns 0 or a PTLS error (outputs reset to NULL). */
+  uint8_t hpkey[PTLS_MAX_SECRET_SIZE];
+  int ret;
+
+  if (hp_ctx != NULL)
+    *hp_ctx = NULL;
+  *aead_ctx = NULL;
+
+  /* generate new header protection key */
+  if (hp_ctx != NULL)
+    {
+      if ((ret =
+	   ptls_hkdf_expand_label (hash, hpkey, aead->ctr_cipher->key_size,
+				   ptls_iovec_init (secret,
+						    hash->digest_size),
+				   "quic hp", ptls_iovec_init (NULL, 0),
+				   NULL)) != 0)
+	goto Exit;
+      if ((*hp_ctx =
+	   ptls_cipher_new (aead->ctr_cipher, is_enc, hpkey)) == NULL)
+	{
+	  ret = PTLS_ERROR_NO_MEMORY;
+	  goto Exit;
+	}
+    }
+
+  /* generate new AEAD context */
+  if ((*aead_ctx =
+       ptls_aead_new (aead, hash, is_enc, secret,
+		      QUICLY_AEAD_BASE_LABEL)) == NULL)
+    {
+      ret = PTLS_ERROR_NO_MEMORY;
+      goto Exit;
+    }
+
+  if (epoch == QUICLY_EPOCH_1RTT && !is_enc)
+    {
+      quic_ctx_t *qctx = quic_get_conn_ctx (conn);
+      if (qctx->ingress_keys.aead_ctx != NULL)
+	qctx->key_phase_ingress++;
+      qctx->ingress_keys.aead_ctx = *aead_ctx;
+      if (hp_ctx != NULL)
+	qctx->ingress_keys.hp_ctx = *hp_ctx;
+    }
+  ret = 0;
+
+Exit:
+  if (ret != 0)
+    {
+      if (*aead_ctx != NULL)
+	{
+	  ptls_aead_free (*aead_ctx);
+	  *aead_ctx = NULL;
+	}
+      /* hp_ctx is optional: never dereference it without checking */
+      if (hp_ctx != NULL && *hp_ctx != NULL)
+	{
+	  ptls_cipher_free (*hp_ctx);
+	  *hp_ctx = NULL;
+	}
+    }
+  ptls_clear_memory (hpkey, sizeof (hpkey));
+  return ret;
+}
+
+void
+quic_crypto_finalize_send_packet_cb (struct st_quicly_crypto_engine_t
+				     *engine, quicly_conn_t * conn,
+				     ptls_cipher_context_t * hp,
+				     ptls_aead_context_t * aead,
+				     quicly_datagram_t * packet,
+				     size_t first_byte_at,
+				     size_t payload_from, int coalesced)
+{
+  /* Record what header protection will need for this QUIC packet; the
+   * masking itself is done later by quic_crypto_finalize_send_packet.
+   * NOTE(review): no bound check against QUIC_MAX_COALESCED_PACKET. */
+  quic_encrypt_cb_ctx *cb_ctx = (quic_encrypt_cb_ctx *) (packet + 1);
+  struct quic_finalize_send_packet_cb_ctx_ *snd =
+    &cb_ctx->snd_ctx[cb_ctx->snd_ctx_count++];
+  snd->hp = hp;
+  snd->first_byte_at = first_byte_at;
+  snd->payload_from = payload_from;
+}
+
+void
+quic_crypto_decrypt_packet (quic_ctx_t * qctx, quic_rx_packet_ctx_t * pctx)
+{
+  /* Try to offload decryption of one received packet.
+   *
+   * Only short-header packets are handled, and only once 1-RTT ingress keys
+   * have been recorded in qctx.  Header protection is removed in place and
+   * an AEAD decrypt op is queued on the RX batch; pctx->packet is updated
+   * so that it describes the decrypted payload.  Whenever the packet cannot
+   * be offloaded it is left byte-for-byte as received. */
+  ptls_cipher_context_t *header_protection = NULL;
+  ptls_aead_context_t *aead = NULL;
+  /* pn feeds the 64-bit decrypted_pn input of the nonce: keeping it in an
+   * int would truncate large packet numbers */
+  uint64_t pn;
+
+  /* Long Header packets are not decrypted by vpp */
+  if (QUICLY_PACKET_IS_LONG_HEADER (pctx->packet.octets.base[0]))
+    return;
+
+  uint64_t next_expected_packet_number =
+    quicly_get_next_expected_packet_number (qctx->conn);
+  if (next_expected_packet_number == UINT64_MAX)
+    return;
+
+  aead = qctx->ingress_keys.aead_ctx;
+  header_protection = qctx->ingress_keys.hp_ctx;
+
+  if (!aead || !header_protection)
+    return;
+
+  uint8_t *base = pctx->packet.octets.base;
+  uint8_t *pn_bytes = base + pctx->packet.encrypted_off;
+  size_t encrypted_len = pctx->packet.octets.len - pctx->packet.encrypted_off;
+  uint8_t hpmask[5] = { 0 };
+  uint32_t pnbits = 0;
+  size_t pnlen, ptlen, i;
+
+  /* decipher the header protection, as well as obtaining pnbits, pnlen */
+  if (encrypted_len < header_protection->algo->iv_size + QUICLY_MAX_PN_SIZE)
+    return;
+  ptls_cipher_init (header_protection, pn_bytes + QUICLY_MAX_PN_SIZE);
+  ptls_cipher_encrypt (header_protection, hpmask, hpmask, sizeof (hpmask));
+  base[0] ^= hpmask[0] & (QUICLY_PACKET_IS_LONG_HEADER (base[0]) ? 0xf : 0x1f);
+  pnlen = (base[0] & 0x3) + 1;
+  for (i = 0; i != pnlen; ++i)
+    {
+      pn_bytes[i] ^= hpmask[i + 1];
+      pnbits = (pnbits << 8) | pn_bytes[i];
+    }
+
+  size_t aead_off = pctx->packet.encrypted_off + pnlen;
+
+  pn =
+    quicly_determine_packet_number (pnbits, pnlen * 8,
+				    next_expected_packet_number);
+
+  int key_phase_bit = (base[0] & QUICLY_KEY_PHASE_BIT) != 0;
+
+  /* packets protected with another key generation are not offloaded */
+  if (key_phase_bit != (qctx->key_phase_ingress & 1))
+    goto restore_header;
+
+  ptlen = quic_crypto_offload_aead_decrypt (qctx, aead, base + aead_off,
+					    base + aead_off,
+					    pctx->packet.octets.len - aead_off,
+					    pn, base, aead_off);
+  if (ptlen == SIZE_MAX)
+    {
+      fprintf (stderr, "%s: aead decryption failure (pn: %llu)\n",
+	       __FUNCTION__, (unsigned long long) pn);
+      goto restore_header;
+    }
+
+  pctx->packet.encrypted_off = aead_off;
+  pctx->packet.octets.len = ptlen + aead_off;
+
+  pctx->packet.decrypted.pn = pn;
+  pctx->packet.decrypted.key_phase = qctx->key_phase_ingress;
+  return;
+
+restore_header:
+  /* Put header protection back.  Without this the first byte and the
+   * packet number would stay unmasked while encrypted_off still marks the
+   * packet as protected. */
+  base[0] ^= hpmask[0] & (QUICLY_PACKET_IS_LONG_HEADER (base[0]) ? 0xf : 0x1f);
+  for (i = 0; i != pnlen; ++i)
+    pn_bytes[i] ^= hpmask[i + 1];
+}
+
+#ifdef QUIC_HP_CRYPTO
 static void
 quic_crypto_cipher_do_init (ptls_cipher_context_t * _ctx, const void *iv)
 {
   struct cipher_context_t *ctx = (struct cipher_context_t *) _ctx;
-
   vnet_crypto_op_id_t id;
   if (!strcmp (ctx->super.algo->name, "AES128-CTR"))
     {
@@ -60,7 +354,6 @@
 		_ctx->algo->name);
       assert (0);
     }
-
   vnet_crypto_op_init (&ctx->op, id);
   ctx->op.iv = (u8 *) iv;
   ctx->op.key_index = ctx->key_index;
@@ -121,25 +414,30 @@
 }
 
 static int
-aes128ctr_setup_crypto (ptls_cipher_context_t * ctx, int is_enc,
-			const void *key)
+quic_crypto_aes128ctr_setup_crypto (ptls_cipher_context_t * ctx, int is_enc,
+				    const void *key)
 {
   return quic_crypto_cipher_setup_crypto (ctx, 1, key, EVP_aes_128_ctr (),
 					  quic_crypto_cipher_encrypt);
 }
 
 static int
-aes256ctr_setup_crypto (ptls_cipher_context_t * ctx, int is_enc,
-			const void *key)
+quic_crypto_aes256ctr_setup_crypto (ptls_cipher_context_t * ctx, int is_enc,
+				    const void *key)
 {
   return quic_crypto_cipher_setup_crypto (ctx, 1, key, EVP_aes_256_ctr (),
 					  quic_crypto_cipher_encrypt);
 }
 
+#endif // QUIC_HP_CRYPTO
+
 void
 quic_crypto_aead_encrypt_init (ptls_aead_context_t * _ctx, const void *iv,
 			       const void *aad, size_t aadlen)
 {
+  quic_main_t *qm = &quic_main;
+  u32 thread_index = vlib_get_thread_index ();
+
   struct aead_crypto_context_t *ctx = (struct aead_crypto_context_t *) _ctx;
 
   vnet_crypto_op_id_t id;
@@ -156,11 +454,18 @@
       assert (0);
     }
 
-  vnet_crypto_op_init (&ctx->op, id);
-  ctx->op.aad = (u8 *) aad;
-  ctx->op.aad_len = aadlen;
-  ctx->op.iv = (u8 *) iv;
-  ctx->op.key_index = ctx->key_index;
+  quic_crypto_batch_ctx_t *quic_crypto_batch_ctx =
+    &qm->wrk_ctx[thread_index].crypto_context_batch;
+
+  vnet_crypto_op_t *vnet_op =
+    &quic_crypto_batch_ctx->aead_crypto_tx_packets_ops
+    [quic_crypto_batch_ctx->nb_tx_packets];
+  vnet_crypto_op_init (vnet_op, id);
+  vnet_op->aad = (u8 *) aad;
+  vnet_op->aad_len = aadlen;
+  vnet_op->iv = clib_mem_alloc (PTLS_MAX_IV_SIZE);
+  clib_memcpy (vnet_op->iv, iv, PTLS_MAX_IV_SIZE);
+  vnet_op->key_index = ctx->key_index;
 }
 
 size_t
@@ -169,11 +474,20 @@
 {
   struct aead_crypto_context_t *ctx = (struct aead_crypto_context_t *) _ctx;
 
-  ctx->op.src = (u8 *) input;
-  ctx->op.dst = output;
-  ctx->op.len = inlen;
-  ctx->op.tag_len = ctx->super.algo->tag_size;
-  ctx->op.tag = ctx->op.src + inlen;
+  quic_main_t *qm = &quic_main;
+  u32 thread_index = vlib_get_thread_index ();
+  quic_crypto_batch_ctx_t *quic_crypto_batch_ctx =
+    &qm->wrk_ctx[thread_index].crypto_context_batch;
+
+  vnet_crypto_op_t *vnet_op =
+    &quic_crypto_batch_ctx->aead_crypto_tx_packets_ops
+    [quic_crypto_batch_ctx->nb_tx_packets];
+  vnet_op->src = (u8 *) input;
+  vnet_op->dst = output;
+  vnet_op->len = inlen;
+  vnet_op->tag_len = ctx->super.algo->tag_size;
+
+  vnet_op->tag = vnet_op->src + inlen;
 
   return 0;
 }
@@ -181,12 +495,16 @@
 size_t
 quic_crypto_aead_encrypt_final (ptls_aead_context_t * _ctx, void *output)
 {
-  vlib_main_t *vm = vlib_get_main ();
-  struct aead_crypto_context_t *ctx = (struct aead_crypto_context_t *) _ctx;
+  quic_main_t *qm = &quic_main;
+  u32 thread_index = vlib_get_thread_index ();
+  quic_crypto_batch_ctx_t *quic_crypto_batch_ctx =
+    &qm->wrk_ctx[thread_index].crypto_context_batch;
 
-  vnet_crypto_process_ops (vm, &ctx->op, 1);
-
-  return ctx->op.len + ctx->op.tag_len;
+  vnet_crypto_op_t *vnet_op =
+    &quic_crypto_batch_ctx->
+    aead_crypto_tx_packets_ops[quic_crypto_batch_ctx->nb_tx_packets];
+  quic_crypto_batch_ctx->nb_tx_packets++;
+  return vnet_op->len + vnet_op->tag_len;
 }
 
 size_t
@@ -226,9 +544,57 @@
 
   vnet_crypto_process_ops (vm, &ctx->op, 1);
 
+  if (ctx->op.status != VNET_CRYPTO_OP_STATUS_COMPLETED)
+    return SIZE_MAX;
+
   return ctx->op.len;
 }
 
+static size_t
+quic_crypto_offload_aead_decrypt (quic_ctx_t * qctx,
+				  ptls_aead_context_t * _ctx, void *_output,
+				  const void *input, size_t inlen,
+				  uint64_t decrypted_pn, const void *aad,
+				  size_t aadlen)
+{
+  /* Queue one AES-GCM decrypt op on the RX batch of qctx's thread; the op
+   * only runs in quic_crypto_batch_rx_packets.  Returns the plaintext
+   * length, or SIZE_MAX when nothing was queued. */
+  struct aead_crypto_context_t *ctx = (struct aead_crypto_context_t *) _ctx;
+  size_t tag_size = ctx->super.algo->tag_size;
+  vnet_crypto_op_id_t id;
+
+  if (!strcmp (ctx->super.algo->name, "AES128-GCM"))
+    id = VNET_CRYPTO_OP_AES_128_GCM_DEC;
+  else if (!strcmp (ctx->super.algo->name, "AES256-GCM"))
+    id = VNET_CRYPTO_OP_AES_256_GCM_DEC;
+  else
+    return SIZE_MAX;
+
+  quic_crypto_batch_ctx_t *batch =
+    &quic_main.wrk_ctx[qctx->c_thread_index].crypto_context_batch;
+
+  /* stay inside the fixed-size op array; keep inlen - tag_size from wrapping */
+  if (batch->nb_rx_packets >= QUIC_RCV_MAX_BATCH_PACKETS || inlen < tag_size)
+    return SIZE_MAX;
+
+  vnet_crypto_op_t *vnet_op =
+    &batch->aead_crypto_rx_packets_ops[batch->nb_rx_packets++];
+
+  vnet_crypto_op_init (vnet_op, id);
+  vnet_op->aad = (u8 *) aad;
+  vnet_op->aad_len = aadlen;
+  vnet_op->iv = clib_mem_alloc (PTLS_MAX_IV_SIZE);	/* freed after the batch */
+  build_iv (_ctx, vnet_op->iv, decrypted_pn);
+  vnet_op->src = (u8 *) input;
+  vnet_op->dst = _output;
+  vnet_op->key_index = ctx->key_index;
+  vnet_op->len = inlen - tag_size;
+  vnet_op->tag_len = tag_size;
+  vnet_op->tag = vnet_op->src + vnet_op->len;
+  return vnet_op->len;
+}
+
 static void
 quic_crypto_aead_dispose_crypto (ptls_aead_context_t * _ctx)
 {
@@ -259,13 +625,16 @@
     }
 
   ctx->super.do_decrypt = quic_crypto_aead_decrypt;
+
   ctx->super.do_encrypt_init = quic_crypto_aead_encrypt_init;
   ctx->super.do_encrypt_update = quic_crypto_aead_encrypt_update;
   ctx->super.do_encrypt_final = quic_crypto_aead_encrypt_final;
   ctx->super.dispose_crypto = quic_crypto_aead_dispose_crypto;
 
+  clib_rwlock_writer_lock (&quic_main.crypto_keys_quic_rw_lock);
   ctx->key_index = vnet_crypto_key_add (vm, algo,
 					(u8 *) key, _ctx->algo->key_size);
+  clib_rwlock_writer_unlock (&quic_main.crypto_keys_quic_rw_lock);
 
   return 0;
 }
@@ -284,24 +653,28 @@
   return quic_crypto_aead_setup_crypto (ctx, is_enc, key, EVP_aes_256_gcm ());
 }
 
-ptls_cipher_algorithm_t quic_crypto_aes128ctr = { "AES128-CTR",
+#ifdef QUIC_HP_CRYPTO
+ptls_cipher_algorithm_t quic_crypto_aes128ctr = {
+  "AES128-CTR",
   PTLS_AES128_KEY_SIZE,
   1, PTLS_AES_IV_SIZE,
-  sizeof (struct cipher_context_t),
-  aes128ctr_setup_crypto
+  sizeof (struct cipher_context_t), quic_crypto_aes128ctr_setup_crypto
 };
 
-ptls_cipher_algorithm_t quic_crypto_aes256ctr = { "AES256-CTR",
-  PTLS_AES256_KEY_SIZE,
-  1 /* block size */ ,
-  PTLS_AES_IV_SIZE,
-  sizeof (struct cipher_context_t),
-  aes256ctr_setup_crypto
+ptls_cipher_algorithm_t quic_crypto_aes256ctr = {
+  "AES256-CTR", PTLS_AES256_KEY_SIZE, 1 /* block size */ , PTLS_AES_IV_SIZE,
+  sizeof (struct cipher_context_t), quic_crypto_aes256ctr_setup_crypto
+};
+#endif
 
-ptls_aead_algorithm_t quic_crypto_aes128gcm = { "AES128-GCM",
+ptls_aead_algorithm_t quic_crypto_aes128gcm = {
+  "AES128-GCM",
+#ifdef QUIC_HP_CRYPTO
   &quic_crypto_aes128ctr,
-  NULL,
+#else
+  &ptls_openssl_aes128ctr,
+#endif
+  &ptls_openssl_aes128ecb,
   PTLS_AES128_KEY_SIZE,
   PTLS_AESGCM_IV_SIZE,
   PTLS_AESGCM_TAG_SIZE,
@@ -309,9 +682,14 @@
   quic_crypto_aead_aes128gcm_setup_crypto
 };
 
-ptls_aead_algorithm_t quic_crypto_aes256gcm = { "AES256-GCM",
+ptls_aead_algorithm_t quic_crypto_aes256gcm = {
+  "AES256-GCM",
+#ifdef QUIC_HP_CRYPTO
   &quic_crypto_aes256ctr,
-  NULL,
+#else
+  &ptls_openssl_aes256ctr,
+#endif
+  &ptls_openssl_aes256ecb,
   PTLS_AES256_KEY_SIZE,
   PTLS_AESGCM_IV_SIZE,
   PTLS_AESGCM_TAG_SIZE,
@@ -319,22 +697,22 @@
   quic_crypto_aead_aes256gcm_setup_crypto
 };
 
-ptls_cipher_suite_t quic_crypto_aes128gcmsha256 =
-  { PTLS_CIPHER_SUITE_AES_128_GCM_SHA256,
-  &quic_crypto_aes128gcm,
-  &ptls_openssl_sha256
+ptls_cipher_suite_t quic_crypto_aes128gcmsha256 = {
+  PTLS_CIPHER_SUITE_AES_128_GCM_SHA256,
+  &quic_crypto_aes128gcm, &ptls_openssl_sha256
 };
 
-ptls_cipher_suite_t quic_crypto_aes256gcmsha384 =
-  { PTLS_CIPHER_SUITE_AES_256_GCM_SHA384,
-  &quic_crypto_aes256gcm,
-  &ptls_openssl_sha384
+ptls_cipher_suite_t quic_crypto_aes256gcmsha384 = {
+  PTLS_CIPHER_SUITE_AES_256_GCM_SHA384,
+  &quic_crypto_aes256gcm, &ptls_openssl_sha384
 };
 
-ptls_cipher_suite_t *quic_crypto_cipher_suites[] =
-  { &quic_crypto_aes256gcmsha384,
-  &quic_crypto_aes128gcmsha256,
-  NULL
+ptls_cipher_suite_t *quic_crypto_cipher_suites[] = {
+  &quic_crypto_aes256gcmsha384, &quic_crypto_aes128gcmsha256, NULL
+};
+
+quicly_crypto_engine_t quic_crypto_engine = {
+  quic_crypto_setup_cipher, quic_crypto_finalize_send_packet_cb
 };
 
 int
diff --git a/src/plugins/quic/quic_crypto.h b/src/plugins/quic/quic_crypto.h
index ff74fac..930b31b 100644
--- a/src/plugins/quic/quic_crypto.h
+++ b/src/plugins/quic/quic_crypto.h
@@ -18,11 +18,29 @@
 
 #include <quicly.h>
 
+struct quic_ctx_t;
+struct quic_rx_packet_ctx_t;
+struct quic_crypto_batch_ctx_t;
+
 extern ptls_cipher_suite_t *quic_crypto_cipher_suites[];
 
 int quic_encrypt_ticket_cb (ptls_encrypt_ticket_t * _self, ptls_t * tls,
 			    int is_encrypt, ptls_buffer_t * dst,
 			    ptls_iovec_t src);
+void quic_crypto_decrypt_packet (quic_ctx_t * qctx,
+				 quic_rx_packet_ctx_t * pctx);
+void quic_crypto_batch_tx_packets (quic_crypto_batch_ctx_t * batch_ctx);
+void quic_crypto_batch_rx_packets (quic_crypto_batch_ctx_t * batch_ctx);
+void quic_crypto_finalize_send_packet (quicly_datagram_t * packet);
+
+void
+quic_crypto_finalize_send_packet_cb (struct st_quicly_crypto_engine_t *engine,
+				     quicly_conn_t * conn,
+				     ptls_cipher_context_t * hp,
+				     ptls_aead_context_t * aead,
+				     quicly_datagram_t * packet,
+				     size_t first_byte_at,
+				     size_t payload_from, int coalesced);
 
 #endif /* __included_vpp_quic_crypto_h__ */