vcl: basic support for epoll level-triggered evts

Type: feature

Signed-off-by: Florin Coras <fcoras@cisco.com>
Change-Id: I2d5039cd35edd02ffa2439bcac119d0647234385
diff --git a/src/vcl/vcl_private.c b/src/vcl/vcl_private.c
index 3538a09..45c208d 100644
--- a/src/vcl/vcl_private.c
+++ b/src/vcl/vcl_private.c
@@ -131,6 +131,8 @@
   hash_free (wrk->session_index_by_vpp_handles);
   vec_free (wrk->mq_events);
   vec_free (wrk->mq_msg_vector);
+  vec_free (wrk->ep_level_evts);
+  vec_free (wrk->ep_level_evts_fl);
   vcl_worker_free (wrk);
   clib_spinlock_unlock (&vcm->workers_lock);
 }
diff --git a/src/vcl/vcl_private.h b/src/vcl/vcl_private.h
index 5b19f94..ab3ecab 100644
--- a/src/vcl/vcl_private.h
+++ b/src/vcl/vcl_private.h
@@ -253,6 +253,12 @@
   /** Per worker buffer for receiving mq epoll events */
   struct epoll_event *mq_events;
 
+  /** Vector of session indices recently notified of epoll level events */
+  u32 *ep_level_evts;
+
+  /** Storage for level-event session indices while new ones are processed */
+  u32 *ep_level_evts_fl;
+
   /** Hash table for disconnect processing */
   uword *session_index_by_vpp_handles;
 
diff --git a/src/vcl/vppcom.c b/src/vcl/vppcom.c
index d378f40..f96ceea 100644
--- a/src/vcl/vppcom.c
+++ b/src/vcl/vppcom.c
@@ -1445,6 +1445,18 @@
   return vcl_session_handle (session);
 }
 
+/* Drop every entry equal to sid from the worker's level-event vector */
+static void
+vcl_epoll_wait_clean_lt (vcl_worker_t *wrk, u32 sid)
+{
+  int pos = vec_len (wrk->ep_level_evts);
+
+  /* Scan tail to head: deleting at pos leaves lower indices untouched */
+  while (--pos >= 0)
+    if (sid == wrk->ep_level_evts[pos])
+      vec_del1 (wrk->ep_level_evts, pos);
+}
+
 int
 vcl_session_cleanup (vcl_worker_t * wrk, vcl_session_t * s,
 		     vcl_session_handle_t sh, u8 do_disconnect)
@@ -1475,6 +1487,8 @@
 	VDBG (0, "session %u [0x%llx]: EPOLL_CTL_DEL vep_idx %u "
 	      "failed! rv %d (%s)", s->session_index, s->vpp_handle,
 	      s->vep.vep_sh, rv, vppcom_retval_str (rv));
+      if (PREDICT_FALSE (vec_len (wrk->ep_level_evts)))
+	vcl_epoll_wait_clean_lt (wrk, s->session_index);
     }
 
   if (!do_disconnect)
@@ -3063,6 +3077,10 @@
 	  s = vcl_session_get (wrk, sid);
 	  s->vep.ev.events = 0;
 	}
+      if (!(EPOLLET & session_events))
+	{
+	  vec_add1 (wrk->ep_level_evts, sid);
+	}
       *num_ev += 1;
     }
 }
@@ -3177,13 +3195,73 @@
   return 0;
 }
 
+/* Exchange the worker's ep_level_evts and ep_level_evts_fl vectors */
+static void
+vcl_epoll_swap_lt_lists (vcl_worker_t *wrk)
+{
+  u32 *parked = wrk->ep_level_evts_fl;
+
+  wrk->ep_level_evts_fl = wrk->ep_level_evts;
+  wrk->ep_level_evts = parked;
+}
+
+/* Re-report level-triggered sessions that are still ready. Sessions parked
+ * in ep_level_evts_fl are polled and, if ready, appended to events and put
+ * back on ep_level_evts. Entries that cannot be polled because events is
+ * full are carried over unchanged. n_evts is updated in place. */
+static void
+vcl_epoll_wait_handle_lt (vcl_worker_t *wrk, struct epoll_event *events,
+			  int maxevents, u32 *n_evts)
+{
+  u32 *sid, ready, *le = wrk->ep_level_evts_fl;
+  vcl_session_t *s;
+
+  if (*n_evts >= maxevents)
+    {
+      /* No room left in events: carry every parked session over */
+      vec_add (wrk->ep_level_evts, le, vec_len (le));
+      vec_reset_length (wrk->ep_level_evts_fl);
+      return;
+    }
+
+  vec_foreach (sid, le)
+    {
+      s = vcl_session_get (wrk, sid[0]);
+      if (!s)
+	continue;
+      /* Build the mask locally: events[] slots are caller memory and may
+       * hold stale bits, so they must be assigned, never OR-ed into */
+      ready = 0;
+      if ((s->vep.ev.events & EPOLLIN) && vcl_session_read_ready (s))
+	ready |= EPOLLIN;
+      if ((s->vep.ev.events & EPOLLOUT) && vcl_session_write_ready (s))
+	ready |= EPOLLOUT;
+      if (!ready)
+	continue;
+
+      events[*n_evts].events = ready;
+      events[*n_evts].data.u64 = s->vep.ev.data.u64;
+      *n_evts += 1;
+      vec_add1 (wrk->ep_level_evts, sid[0]);
+      if (*n_evts == maxevents)
+	{
+	  /* Output full: keep the remaining sessions for the next call */
+	  u32 pos = (sid - le) + 1;
+	  vec_add (wrk->ep_level_evts, &le[pos], vec_len (le) - pos);
+	  break;
+	}
+    }
+
+  vec_reset_length (wrk->ep_level_evts_fl);
+}
+
 int
 vppcom_epoll_wait (uint32_t vep_handle, struct epoll_event *events,
 		   int maxevents, double wait_for_time)
 {
   vcl_worker_t *wrk = vcl_worker_get_current ();
   vcl_session_t *vep_session;
-  u32 n_evts = 0;
+  u32 n_evts = 0, do_lt = 0;
   int i;
 
   if (PREDICT_FALSE (maxevents <= 0))
@@ -3222,12 +3300,23 @@
   if ((int) wait_for_time == -2)
     return n_evts;
 
-  if (vcm->cfg.use_mq_eventfd)
-    return vppcom_epoll_wait_eventfd (wrk, events, maxevents, n_evts,
-				      wait_for_time);
+  if (PREDICT_FALSE (vec_len (wrk->ep_level_evts)))
+    {
+      vcl_epoll_swap_lt_lists (wrk);
+      do_lt = 1;
+    }
 
-  return vppcom_epoll_wait_condvar (wrk, events, maxevents, n_evts,
-				    wait_for_time);
+  if (vcm->cfg.use_mq_eventfd)
+    n_evts = vppcom_epoll_wait_eventfd (wrk, events, maxevents, n_evts,
+					wait_for_time);
+  else
+    n_evts = vppcom_epoll_wait_condvar (wrk, events, maxevents, n_evts,
+					wait_for_time);
+
+  if (PREDICT_FALSE (do_lt))
+    vcl_epoll_wait_handle_lt (wrk, events, maxevents, &n_evts);
+
+  return n_evts;
 }
 
 int